#include "peddata.h" PedData::PedData(){ MAXTOP = 20; snp_size = 0; sample_size = 0; chrom = 0; outfp = stdout; sigfp = stdout; // default threshold md_thresh = 0.4; r2_thresh = 1.0; } int PedData::run(ParamSet &ps){ fprintf(outfp, "****** Results of the ATRIUM Tests ******\n\n"); fprintf(stderr, "Loading phenotype data ... \n"); if(read_pheno(ps.phenofile)<0) return -1; fprintf(stderr, "Loading IBD coefficients ... \n"); if(load_ibdcoeff(ps.ibdfile)<0) return -1; fprintf(stderr, "Loading database ... \n"); if(load_db(ps.dbfile)<0) return -1; fprintf(stderr, "Loading genotype data ... \n"); if(read_geno(ps.genofile)<0) return -1; this->md_thresh = ps.md_thresh; this->r2_thresh = ps.r2_thresh; fprintf(stderr, "Loading parameter ... \n"); if(load_parameter(ps.parameterfile)<0) return -1; fprintf(outfp, "The prevalence value used in the ATRIUM calculations is %.6f.\n\n", prevalence); signal_scan(); return 1; } void PedData::signal_scan(){ fprintf(stderr, "Starting ATRIUM ... \n"); assign(); vector marker_list; for(int iter=0;iterr2_thresh) continue; printf("markers %d\n", iter+1); fprintf(outfp, "\n*****************************************************\n\n"); fprintf(outfp, "Target SNP: "); fprintf(outfp, "%s (%d on chr %d)\n", marker_info[iter].rsnumber.c_str(),marker_info[iter].position,chrom); fprintf(outfp, "Md = %.4f Maximum r^2 = %.4f", marker_info[iter].Md,marker_info[iter].mr2); fprintf(outfp, "\n\n"); fprintf(outfp, "*****************************************************\n\n"); // fprintf(outfp, "%d\t%s\t%d\t%d\t", iter+1,marker_info[iter].rsnumber.c_str(),marker_info[iter].onchip,marker_info[iter].position); nmark = marker_info[iter].anchor_num; // printout tag SNPs fprintf(outfp, "Tag SNPs\n"); fprintf(outfp, " rsnumber\t position\n"); vector seq; // order in data vector tseq; // order in marker_info map seq_map; // key:order in data value: order in tag // haplotype coding conversion for(int kk=0;kk a0v; vector a1v; for(int k=0;k trans_map; // key: haplotype value: reordered haplotype for(int kk=0;kk>1; } trans_map[code] = kk; } assign_gcode(iter); weight_table.clear(); weight_table = marker_info[iter].weight_table; hap_list.clear(); // fprintf(stderr, "Storing unique genotypes ... \n"); unique_gcode(); assign_info(); // fprintf(stderr, "Get EW estimator ... \n"); EW_a_estimator(); // fprintf(stderr, "Get CDW estimator ... \n"); CDW_a_estimator(); int flag = 1; map::iterator hiter = hap_freq_table.begin(); while(hiter!=hap_freq_table.end()){ if(hiter->second<0.00005){ // printf("The number of the distinct haplotypes is less\n"); flag = 0; break; } hiter++; } if(flag==0){ hiter = hap_freq_table.begin(); while(hiter!=hap_freq_table.end()){ if(hiter->second>=0.00005) hap_list[hiter->first] = 0; hiter++; } vector hpcontent; hiter = hap_list.begin(); while(hiter!=hap_list.end()){ char *pl = new char[nmark+1]; memset(pl,0,nmark+1); int h = hiter->first; for(int j=0;j>1; } hpcontent.push_back(pl); hiter++; } for(int i=0;i hpcontent; // fprintf(stderr, "Storing unique missing patten ... \n"); unique_mis_patten(hpcontent); // fprintf(stderr, "Get ATRIUM_a estimator ... \n"); ATRIUM_a_estimator(trans_map,a0v,a1v,iter); // fprintf(stderr, "Storing unique trio genotypes ... \n"); unique_trio_gcode(); // fprintf(stderr, "Storing unique trio missing patten ... \n"); unique_mis_trio_patten(hpcontent); // fprintf(stderr, "Get ATRIUM_b estimator ... \n"); ATRIUM_b_estimator(trans_map,a0v,a1v,iter); } fprintf(outfp, "\n"); // fprintf(stderr, "Clean heap ... \n\n"); mem_clean(); punit su; su.place = iter; su.p = p; marker_list.push_back(su); } int top = MAXTOP; if(MAXTOP >= marker_list.size()) top = marker_list.size(); fprintf(sigfp, "Top %d results for the ATRIUM test (list of the untyped SNPs having the %d smallest p-values among all untyped SNPs that were tested) \n\n", top,top); fprintf(sigfp, "**************************************************\n\n"); // printf("size of p is %d\n", marker_list.size()); std::sort(marker_list.begin(),marker_list.end(),inc_sort); for(int j=0;j fam_member; while(getline(phenodata,line)){ lc++; ins.clear(); ins.str(line); ins>>ws; if(ins.eof()) continue; if(ins >> fam_id >> ind_id >> father_id >> mother_id >> sex >> phenotype){ if(phenotype < 0 || phenotype > 2) fprintf(stderr, "individual %d from family %d does not have a phenotype value of 0, 1, or 2 ...skipped\n\n", ind_id,fam_id); else individual_list[ind_id] = -1; if(fam_id!=fam_old){ if(fam_old!=-1){ fam_list.push_back(new Family(fam_member,fam_old)); fam_member.clear(); } if(fam_map.find(fam_id)!=fam_map.end()){ fprintf(stderr, "family %d is not well ordered\n\n", fam_id); return 0; } else{ fam_map[fam_id] = fam_list.size(); fam_old=fam_id; } } int *a = new int[5]; memset(a,0,5*sizeof(int)); a[0] = ind_id; a[1] = father_id; a[2] = mother_id; a[3] = sex; a[4] = phenotype; fam_member.push_back(a); } } fam_list.push_back(new Family(fam_member,fam_old)); fam_member.clear(); for(int i=0;iAffected; n_unaff+=fam_list[i]->UnAffected; n_unknown+=fam_list[i]->Unknown; } sample_size=n_aff+n_unaff+n_unknown; fprintf(outfp, "There are %d individuals from %d independent families in the phenotype data file.\n", sample_size,fam_list.size()); fprintf(outfp, "Of these, %d are affected, %d are unaffected and %d are of unknown phenotype.\n\n", n_aff,n_unaff,n_unknown); return 1; } int PedData::load_db(char *db_file){ ifstream lib(db_file); istringstream ins; string line; int lc=0; // line number in the file //variable string rsnumber; int onchip; int pos; double temp; // max_md double md; double mr2; char allele0; char allele1; int model_size; // # of tag SNPs string ref_s; string weight_s; while(getline(lib,line)){ lc++; const char *line_str = line.c_str(); if(strstr(line_str,"MONOMORPHIC")!=0 || strstr(line_str,"NOINFO")!=0) continue; ins.clear(); ins.str(line); ins>>ws; // empty line, skip if(ins.eof()) continue; if(ins >> rsnumber >> onchip >> pos >> allele0 >> allele1 >> temp >> mr2 >> md >> model_size >> ref_s >> weight_s){ MarkerInfo mi; mi.rsnumber = rsnumber; mi.onchip = onchip; mi.position = pos; mi.order = -1; if(allele0>=97) // convert to capital letter allele0-=32; if(allele1>=97) allele1-=32; // direct assignment, no assumption about allele0 < allele1 mi.allele0 = allele0; mi.allele1 = allele1; vector anchor_list = tokenize(ref_s,string(":")); mi.anchor_list = anchor_list; // maximum r^2 mi.mr2 = mr2; mi.Md = md; mi.anchor_num = model_size; map weight_map; // key: haplotype, value: weight map hfreq_map; // weights vector w_list = tokenize(weight_s,string("_")); for(int k=0;k wsi = tokenize(w_list[k],string(":")); int code = atoi(wsi[0].c_str()); // haplotype hfreq_map[code] = atoi(wsi[1].c_str()); weight_map[code] = atof(wsi[3].c_str()); // weight } mi.weight_table = weight_map; mi.phap_freq_table = hfreq_map; index_map[rsnumber] = marker_info.size(); marker_info.push_back(mi); } else{ fprintf(stderr, "Error: Unexpected format in LD Database %s line %d\n", db_file,lc); return -1; } } return 1; } int PedData::load_ibdcoeff(char *ibdco_file){ int ind_1, ind_2; double coef8, coef7, coef6, coef5, coef4, coef3, coef2, coef1, coef0; int loc; map ind_fam; for(int i=0;iN;j++) ind_fam[fam_list[i]->fam_member[j][0]] = i; ifstream ibdcodata(ibdco_file); istringstream ins; string line; int lc=0; // line number in the file while(getline(ibdcodata,line)){ lc++; ins.clear(); ins.str(line); ins>>ws; if(ins.eof()) continue; if(ins >> ind_1 >> ind_2 >> coef8 >> coef7 >> coef6>> coef5 >> coef4 >> coef3 >> coef2 >> coef1 >> coef0){ if(ind_1==ind_2){ if(ind_fam.find(ind_1)==ind_fam.end()) fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file); else{ loc = ind_fam[ind_1]; if(coef8>0 || coef7>0 || coef6>0 || coef5>0 || coef4>0 || coef3>0 || coef1>0 || coef0>0 || fabs(coef2-1.0)>1e-8){ fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file); if(individual_list.find(ind_1)!=individual_list.end()) individual_list.erase(ind_1); } else fam_list[loc]->kinship_coeff[pair(ind_1,ind_2)] = 0; } } else{ if(ind_fam.find(ind_1)==ind_fam.end() || ind_fam.find(ind_2)==ind_fam.end()) fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file); else{ loc = ind_fam[ind_1]; if(ind_fam[ind_2]!=loc) fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file); else{ if(coef8>0 || coef7>0 || coef6>0 || coef5>0 || coef4>0 || coef3>0) fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file); else{ fam_list[loc]->IBD2_coeff[pair(ind_1,ind_2)] = coef2; fam_list[loc]->IBD1_coeff[pair(ind_1,ind_2)] = coef1; fam_list[loc]->IBD0_coeff[pair(ind_1,ind_2)] = coef0; fam_list[loc]->kinship_coeff[pair(ind_1,ind_2)] = coef2/2.0+coef1/4.0; fam_list[loc]->IBD2_coeff[pair(ind_2,ind_1)] = coef2; fam_list[loc]->IBD1_coeff[pair(ind_2,ind_1)] = coef1; fam_list[loc]->IBD0_coeff[pair(ind_2,ind_1)] = coef0; fam_list[loc]->kinship_coeff[pair(ind_2,ind_1)] = coef2/2.0+coef1/4.0; } } } } } } ind_fam.clear(); return 1; } int PedData::read_geno(char *geno_file){ string marker, alias; int chr, pos; char orientation, alleleA, alleleB; string geno; vector genotype_line; int ind_num, count=0; vector record; ifstream genodata(geno_file); istringstream ins; string line; int lc=0; // line number in the file if(getline(genodata,line)){ // get the first line ins.clear(); ins.str(line); ins>>ws; if(ins >> alias >> alias >> alias >> alias >> alias >> alias){ // rsnumber; chromosome; position; orientation; alleleA; alleleB while(ins >> ind_num){ if(individual_list.find(ind_num)!=individual_list.end()){ individual_list[ind_num] = count; count++; record.push_back(1); } else // individual is not in phenotype file record.push_back(0); } people_size = count; fprintf(outfp, "There are %d individuals listed in the genotype file, %d of whom are also in the phenotype file.\n\n", record.size(),people_size); } } while(getline(genodata,line)){ lc++; ins.clear(); ins.str(line); ins>>ws; if(ins.eof()) continue; if(ins >> marker >> chr >> pos >> orientation >> alleleA >> alleleB){ if(chrom == 0) chrom = chr; if(index_map.find(marker)!=index_map.end()){ int index = index_map[marker]; marker_info[index].onchip = 1; marker_info[index].order = snp_size; if(orientation == '+') marker_info[index].strand_switch = 0; else if(orientation == '-') marker_info[index].strand_switch = 1; marker_info[index].alleleA = alleleA; marker_info[index].alleleB = alleleB; // mapping map order and target_list order order2index[snp_size] = index; snp_size++; count = 0; while(ins >> geno){ if(record[count]==1) genotype_line.push_back(geno); count++; } if(count!=record.size()) printf("error in genofile\n"); if(people_size != genotype_line.size()){ fprintf(stderr, "Error: Unequal number of persons recorded in genotyo data file\n"); return -1; } string *cl = new string[genotype_line.size()]; for(int i=0;ida1){ char temp = da0; da0 = da1; da1 = temp; } int sw_flag = marker_info[index].strand_switch; map geno_code; geno_code.clear(); geno_code["NN"] = -1; geno_code["ND"] = -1; //check data/target coding system // da0 < da1 always true if((ta0 == da0 && ta1 == da1) || (ta0 == da1 && ta1 == da0)){ // A/T SNP; if(ta0 == 'A' && ta1 == 'T' && da0 == 'A' && da1 == 'T'){ if(sw_flag == 1){ geno_code["AA"] = 2; geno_code["TA"] = geno_code["AT"] = 1; geno_code["TT"] = 0; } else{ geno_code["AA"] = 0; geno_code["TA"] = geno_code["AT"] = 1; geno_code["TT"] = 2; } } else if(ta0 == 'T' && ta1 == 'A' && da0 == 'A' && da1 == 'T'){ if(sw_flag == 1){ geno_code["AA"] = 0; geno_code["TA"] = geno_code["AT"] = 1; geno_code["TT"] = 2; } else{ geno_code["AA"] = 2; geno_code["TA"] = geno_code["AT"] = 1; geno_code["TT"] = 0; } } // C/G SNP else if(ta0 == 'C' && ta1 == 'G' && da0 == 'C' && da1 == 'G'){ if(sw_flag == 1){ geno_code["CC"] = 2; geno_code["CG"] = geno_code["GC"] = 1; geno_code["GG"] = 0; } else{ geno_code["CC"] = 0; geno_code["CG"] = geno_code["GC"] = 1; geno_code["GG"] = 2; } } else if(ta0 == 'G' && ta1 == 'C' && da0 == 'C' && da1 == 'G'){ if(sw_flag == 1){ geno_code["CC"] = 0; geno_code["CG"] = geno_code["GC"] = 1; geno_code["GG"] = 2; } else{ geno_code["CC"] = 2; geno_code["CG"] = geno_code["GC"] = 1; geno_code["GG"] = 0; } } // not A/T or C/G SNPs else{ char AA[3]; char AB[3]; char BA[3]; char BB[3]; AA[2]=AB[2]=BA[2]=BB[2]=0; AA[0]=da0; AA[1]=da0; AB[0]=da0; AB[1]=da1; BA[0]=da1; BA[1]=da0; BB[0]=da1; BB[1]=da1; geno_code[string(AB)]=geno_code[string(BA)]=1; if(ta0 == da0){ geno_code[string(AA)] = 0; geno_code[string(BB)] = 2; } else if(ta0 == da1){ geno_code[string(AA)] = 2; geno_code[string(BB)] = 0; } } } // not equal else{ if(da0 == 'G' && da1 == 'T'){ if(ta0 == 'A' && ta1 == 'C'){ geno_code["TT"] = 0; geno_code["TG"] = geno_code["GT"] = 1; geno_code["GG"] = 2; } else if(ta0 =='C' && ta1 == 'A'){ geno_code["TT"] = 2; geno_code["TG"] = geno_code["GT"] = 1; geno_code["GG"] = 0; } } else if(da0 == 'C' && da1 == 'T'){ if(ta0 == 'A' && ta1 == 'G'){ geno_code["TT"] = 0; geno_code["CT"] = geno_code["TC"] = 1; geno_code["CC"] = 2; } else if(ta0 == 'G' && ta1 == 'A'){ geno_code["TT"] = 2; geno_code["CT"] = geno_code["TC"] = 1; geno_code["CC"] = 0; } } else if(da0 == 'A' && da1 == 'G'){ if(ta0 == 'C' && ta1 == 'T'){ geno_code["GG"] = 0; geno_code["AG"] = geno_code["GA"] = 1; geno_code["AA"] = 2; } else if(ta0 == 'T' && ta1 == 'C'){ geno_code["GG"] = 2; geno_code["AG"] = geno_code["GA"] = 1; geno_code["AA"] = 0; } } else if(da0 == 'A' && da1 == 'C'){ if(ta0 == 'G' && ta1 == 'T'){ geno_code["CC"] = 0; geno_code["AC"] = geno_code["CA"] = 1; geno_code["AA"] = 2; } else if(ta0 == 'T' && ta1 == 'G'){ geno_code["CC"] = 2; geno_code["AC"] = geno_code["CA"] = 1; geno_code["AA"] = 0; } } } int *data = new int[people_size]; for(int j=0;j>ws; if(ins >> kp) prevalence = kp; return 1; } void PedData::assign(){ for(int i=0;iset(); } void PedData::assign_gcode(int iter){ vector index; for(int i=0;iset_gcode(nmark); for(int j=0;jN;j++){ int ind_num = fam_list[i]->fam_member[j][0]; if(individual_list.find(ind_num)!=individual_list.end()){ int col = individual_list[ind_num]; if(col>=0){ for(int k=0;kset_gcode(gd,j); for(int k=0;kfam_member[j][4]==2) count_aff++; else if(fam_list[i]->fam_member[j][4]==1) count_unaff++; else if(fam_list[i]->fam_member[j][4]==0) count_unknown++; break; } } } } } fam_list[i]->fill_typed(); } delete[] gd; index.clear(); fprintf(outfp, "There are %d individuals with at least one tag SNP genotyped.\n", count); fprintf(outfp, "Of these, %d are affected, %d are unaffected and %d are of unknown phenotype. \n\n", count_aff,count_unaff,count_unknown); } void PedData::assign_info(){ for(int i=0;ifill_info_gcode(geno_list); } void PedData::unique_gcode(){ complete_geno_list.clear(); int count=0; int *gd = new int[nmark]; for(int k=0;k *complete_geno_code = geno->get_geno_code(); // complete-data genotypes count = complete_geno_code->size(); // printf("There are %d complete genotypes\n", count); for(int i=0;i > hv = complete_geno->get_hap_vec(); if(hv.size()>0){ complete_geno->compute_gname(); complete_geno_list.push_back(complete_geno); } else delete complete_geno; } delete geno; // printf("There are %d complete genotypes stored\n", complete_geno_list.size()); geno_list.clear(); map gmap; for(int i=0;iN;j++){ int code=0; for(int k=0;kgcode[j][k]; gd[k] = gc; code = 10*code+gc; } if(gmap.find(code)==gmap.end()){ gmap[code] = geno_list.size(); geno_list.push_back(new Genotype(gd,hap_list,nmark)); } fam_list[i]->set_cat(gmap[code],j); geno_list[gmap[code]]->increase_num(); } delete[] gd; /* for(int i=0;ishow_gstat(); printf("\n"); for(int i=0;ishow_gstat(); */ } void PedData::EW_a_estimator(){ HFEstimator est(fam_list,geno_list,hap_list,nmark); int count = 1; int flag; while(1){ map of = est.get_hap_freq(); est.EM_Iteration(0); map nf = est.get_hap_freq(); double diff = 0; flag = 1; for(int i=0;i100 || flag==0) break; count++; } if(flag==1) hap_freq_table = est.get_hap_freq(); // printf("It takes %d iterations.\n", count); // est.show_hap_freq(); } void PedData::CDW_a_estimator(){ // compute complete-data weight for(int i=0;icompute_weight(); // sum up for(int i=0;iN;j++){ int cat=fam_list[i]->cat[j]; geno_list[cat]->increase_count(fam_list[i]->weight[j]); } HFEstimator est(fam_list,geno_list,hap_list,nmark); est.reset_freq_table(hap_freq_table); int count = 1; int flag; while(1){ map of = est.get_hap_freq(); est.EM_Iteration(1); map nf = est.get_hap_freq(); double diff = 0; flag = 1; for(int i=0;i20 || flag==0) break; count++; } if(flag==1) hap_freq_table = est.get_hap_freq(); // printf("It takes %d iterations.\n", count); // est.show_hap_freq(); } void PedData::unique_mis_patten(const vector & hpcontent){ missing_list.clear(); map mis_map; for(int i=0;iN;j++){ int code=0; vector missing_site; for(int k=0;kgcode[j][k]; if(gc==-1){ code = 10*code+gc; missing_site.push_back(k); } else code*=10; } if(mis_map.find(code)==mis_map.end()){ mis_map[code] = missing_list.size(); missing_list.push_back(new Missing(missing_site,hap_list,hpcontent,nmark)); } fam_list[i]->set_mis_cat(mis_map[code],j); missing_list[mis_map[code]]->increase_num(); missing_site.clear(); } /* for(int i=0;idim,missing_list[i]->mapping_function.size()); printf("\n"); */ } void PedData::ATRIUM_a_estimator(map &trans_map, vector &a0v, vector &a1v, int loc){ for(int i=0;icompute_gname(); // IQLa IQLAEstimator est(fam_list,complete_geno_list,geno_list,missing_list,hap_list,nmark); est.set_output(outfp); est.reset_freq_table(hap_freq_table); int count = 1; int flag; est.allocate(); while(1){ map of = est.get_hap_freq(); est.NR_Iteration_new(); map nf = est.get_hap_freq(); double diff = 0; flag = 1; for(int i=0;i7 || flag==0) break; count++; } if(flag==1) hap_freq_table = est.get_hap_freq(); // printf("It takes %d iterations.\n", count); // est.show_hap_freq(trans_map,a0v,a1v); // compute test weight double mean = -prevalence/(1-prevalence); for(int i=0;icompute_test_weight(mean); hap_weight.clear(); double sum_1 = 0; double sum_2 = 0; map::iterator iter = hap_freq_table.begin(); while(iter!=hap_freq_table.end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()){ sum_1 += (iter->second*weight_table[h]); sum_2 += (iter->second*(1-weight_table[h])); } iter++; } iter = hap_freq_table.begin(); while(iter!=hap_freq_table.end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()) hap_weight[h] = weight_table[h]/sum_1-(1-weight_table[h])/sum_2; else hap_weight[h] = 0; iter++; } // association test // IQLS_a test est.Score_test_new(hap_weight); // est.show_test(); est.deallocate(); p = est.get_p_value(); stat = est.get_stat(); } void PedData::unique_trio_gcode(){ geno_trio_list.clear(); vector triolist; oneparent = 0; for(int i=0;iN;j++){ trio m; int id_1, id_2, ind_1, ind_2; int loc; int founder=0; int cat_1, cat_2; int typed_1, typed_2; int swi=0; m.cat_o = fam_list[i]->cat[j]; m.typed_o = fam_list[i]->typed[j]; ind_1 = fam_list[i]->fam_member[j][1]; ind_2 = fam_list[i]->fam_member[j][2]; if(ind_1==0 && ind_2==0){ // j is a founder m.cat_par1 = -1; m.typed_par1 = 0; m.cat_par2 = -1; m.typed_par2 = 0; founder = 1; } else{ id_1 = fam_list[i]->member_map[ind_1]; cat_1 = fam_list[i]->cat[id_1]; typed_1 = fam_list[i]->typed[id_1]; id_2 = fam_list[i]->member_map[ind_2]; cat_2 = fam_list[i]->cat[id_2]; typed_2 = fam_list[i]->typed[id_2]; if(typed_1==typed_2){ // 00 or 11 if(cat_1<=cat_2){ m.cat_par1 = cat_1; m.typed_par1 = typed_1; m.cat_par2 = cat_2; m.typed_par2 = typed_2; } else{ m.cat_par1 = cat_2; m.typed_par1 = typed_2; m.cat_par2 = cat_1; m.typed_par2 = typed_1; swi = 1; } } else if(typed_1set_trio_cat(geno_trio_list.size(),j); triolist.push_back(m); if(founder==1) geno_trio_list.push_back(new Genotypetrio(geno_list[m.cat_o],m.typed_o,0,0,nmark)); else geno_trio_list.push_back(new Genotypetrio(geno_list[m.cat_o],geno_list[m.cat_par1],geno_list[m.cat_par2],m.typed_o,m.typed_par1,m.typed_par2,nmark)); geno_trio_list[geno_trio_list.size()-1]->increase_num(); if(oneparent==0){ if((typed_1==1 && typed_2==0) || (typed_1==0 && typed_2==1)) oneparent = 1; } } else{ fam_list[i]->set_trio_cat(loc,j); geno_trio_list[loc]->increase_num(); } } } triolist.clear(); } void PedData::unique_mis_trio_patten(const vector & hpcontent){ missing_trio_list.clear(); vector mis_triolist; for(int i=0;iN;j++){ trio m; int id_1, id_2, ind_1, ind_2; int loc; int founder=0; int cat_1, cat_2; int typed_1, typed_2; int swi=0; m.cat_o = fam_list[i]->mis_cat[j]; m.typed_o = fam_list[i]->typed[j]; ind_1 = fam_list[i]->fam_member[j][1]; ind_2 = fam_list[i]->fam_member[j][2]; if(ind_1==0 && ind_2==0){ // j is a founder m.cat_par1 = -1; m.typed_par1 = 0; m.cat_par2 = -1; m.typed_par2 = 0; founder = 1; } else{ id_1 = fam_list[i]->member_map[ind_1]; cat_1 = fam_list[i]->mis_cat[id_1]; typed_1 = fam_list[i]->typed[id_1]; id_2 = fam_list[i]->member_map[ind_2]; cat_2 = fam_list[i]->mis_cat[id_2]; typed_2 = fam_list[i]->typed[id_2]; if(typed_1==typed_2){ // 00 or 11 if(cat_1<=cat_2){ m.cat_par1 = cat_1; m.typed_par1 = typed_1; m.cat_par2 = cat_2; m.typed_par2 = typed_2; } else{ m.cat_par1 = cat_2; m.typed_par1 = typed_2; m.cat_par2 = cat_1; m.typed_par2 = typed_1; swi = 1; } } else if(typed_1set_mis_trio_cat(missing_trio_list.size(),j); mis_triolist.push_back(m); if(founder==1) missing_trio_list.push_back(new Missingtrio(missing_list[m.cat_o],m.typed_o,0,0,hap_list,hpcontent,nmark)); else missing_trio_list.push_back(new Missingtrio(missing_list[m.cat_o],missing_list[m.cat_par1],missing_list[m.cat_par2],m.typed_o,m.typed_par1,m.typed_par2,hap_list,hpcontent,nmark)); missing_trio_list[missing_trio_list.size()-1]->increase_num(); } else{ fam_list[i]->set_mis_trio_cat(loc,j); missing_trio_list[loc]->increase_num(); } fam_list[i]->set_mis_par_order(swi,j); } } mis_triolist.clear(); } void PedData::ATRIUM_b_estimator(map &trans_map, vector &a0v, vector &a1v, int loc){ IQLBEstimator est(fam_list,geno_trio_list,missing_trio_list,hap_list,nmark,oneparent); est.set_output(outfp); est.reset_freq_table(hap_freq_table); int count = 1; int flag; est.allocate(); while(1){ map of = est.get_hap_freq(); est.NR_Iteration_new(); map nf = est.get_hap_freq(); double diff = 0; flag = 1; for(int i=0;i5 || flag==0) break; count++; } if(flag==1) hap_freq_table = est.get_hap_freq(); // printf("It takes %d iterations.\n", count); est.show_hap_freq(trans_map,a0v,a1v); est.estimate_subgroup(); // compute test trio weight double mean = -prevalence/(1-prevalence); for(int i=0;icompute_test_trio_weight(mean); hap_weight.clear(); double sum_1 = 0; double sum_2 = 0; double sum_hap = 0; map::iterator iter = hap_freq_table.begin(); while(iter!=hap_freq_table.end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()){ sum_1 += (iter->second*weight_table[h]); sum_2 += (iter->second*(1-weight_table[h])); sum_hap += iter->second; } iter++; } sum_hap = sum_1/sum_hap; iter = hap_freq_table.begin(); while(iter!=hap_freq_table.end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()) hap_weight[h] = weight_table[h]/sum_1-(1-weight_table[h])/sum_2; else hap_weight[h] = 0; iter++; } est.Score_test_new(hap_weight); est.show_test(stat); est.deallocate(); p = est.get_p_value(); fprintf(outfp, "\nAllele frequency estimates for the untyped SNP\n"); fprintf(outfp, " full sample "); if(est.aff>=15) fprintf(outfp, "affected "); if(est.unaff>=15) fprintf(outfp, "unaffected "); if(est.unknown>=15) fprintf(outfp, "unknown"); fprintf(outfp, "\n"); double sum_aff = 0; double sum_unaff = 0; double sum_unknown = 0; fprintf(outfp, "allele %c: freq = %.4f ", marker_info[loc].allele0,sum_hap); if(est.aff>=15){ sum_1 = 0; sum_aff = 0; iter = (est.aff_hap_freq_table).begin(); while(iter!=(est.aff_hap_freq_table).end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()){ sum_1 += (iter->second*weight_table[h]); sum_aff += iter->second; } iter++; } sum_aff = sum_1/sum_aff; fprintf(outfp, "freq = %.4f ", sum_aff); } if(est.unaff>=15){ sum_1 = 0; sum_unaff = 0; iter = (est.unaff_hap_freq_table).begin(); while(iter!=(est.unaff_hap_freq_table).end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()){ sum_1 += (iter->second*weight_table[h]); sum_unaff += iter->second; } iter++; } sum_unaff = sum_1/sum_unaff; fprintf(outfp, "freq = %.4f ", sum_unaff); } if(est.unknown>=15){ sum_1 = 0; sum_unknown = 0; iter = (est.unknown_hap_freq_table).begin(); while(iter!=(est.unknown_hap_freq_table).end()){ int h = iter->first; if(weight_table.find(h)!=weight_table.end()){ sum_1 += (iter->second*weight_table[h]); sum_unknown += iter->second; } iter++; } sum_unknown = sum_1/sum_unknown; fprintf(outfp, "freq = %.4f ", sum_unknown); } fprintf(outfp, "\n"); fprintf(outfp, "allele %c: freq = %.4f ", marker_info[loc].allele1,1-sum_hap); if(est.aff>=15) fprintf(outfp, "freq = %.4f ", 1-sum_aff); if(est.unaff>=15) fprintf(outfp, "freq = %.4f ", 1-sum_unaff); if(est.unknown>=15) fprintf(outfp, "freq = %.4f ", 1-sum_unknown); fprintf(outfp, "\n"); if(est.aff>0 && est.aff*(sum_aff<=1-sum_aff ? sum_aff : 1-sum_aff)<5) fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in affecteds\n"); if(est.unaff>=est.unknown){ if(est.unaff>0 && est.unaff*(sum_unaff<=1-sum_unaff ? sum_unaff : 1-sum_unaff)<5) fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in unaffecteds\n"); } else{ if(est.unknown>0 && est.unknown*(sum_unknown<=1-sum_unknown ? sum_unknown : 1-sum_unknown)<5) fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in unknowns\n"); } } void PedData::print_data(){ for(int i=0;iprint(); } void PedData::mem_clean(){ for(int i=0;idelete_gcode(); for(int i=0;i tokenize(const string& str, const string& delimiters){ vector tokens; // skip delimiters at beginning. string::size_type lastPos = str.find_first_not_of(delimiters,0); // find first "non-delimiter". string::size_type pos = str.find_first_of(delimiters,lastPos); while(string::npos != pos || string::npos != lastPos){ // found a token, add it to the vector. tokens.push_back(str.substr(lastPos,pos-lastPos)); // skip delimiters. Note the "not_of" lastPos = str.find_first_not_of(delimiters,pos); // find next "non-delimiter" pos = str.find_first_of(delimiters,lastPos); } return tokens; } int find_loc(const vector & s_vec, const trio & m) { for(int i=0;i