#include #include #include extern ofstream errorfs; #include #include #include #include "includes.h" #include "grid.h" #include "declare.h" /* This file contains source code for functions associated with reading and formatting the input */ //___________________________________________________________________________ // Read in all the datafile and pedfile data. int readinput(char* datafile, char* pedfile, int debug, int &program_code, int &data_type, int &num_loci, int &numblanks, int &num_markers, ivector &markers, int &map_type, int &map_size, dvector &map, int &num_total_affecteds, int &num_indiv, ivector &indiv_list, int &hap_or_freq, int &mkvorder, int &bayes, int &num_total_control, int &num_control, ivector &control_list, ivector &num_alleles, ivectors &allele_list, dvectors &allele_freq, dvector &mut_rates, int &est_p, double &p, double &max_res, int &map_res, int &max_cand, int &anc_hap_known, int &num_anc_hap_known, ivectors &anc_hap, int &C_loc, int &C_all, int &E_int, simpstrg& resout, simpstrg& ancout, simpstrg& maxout,simpstrg& oneout, int &L_End, int &R_End, double &fixed_thresh, double &var_thresh, pedvector &peds) { int i, j, k, check; ifstream dfs(datafile); if (!dfs) { errorfs << "Unable to read from datafile [" << datafile << "]\n" << endl; exit(1); } // int program_code; check = get_next(dfs, program_code); if (!check) check_error("program_code"); if (debug) print_var("program_code", program_code); if (!(program_code == 1 || program_code == 2 || program_code == 3)) { errorfs << "Invalid program code. Should be 1, 2 or 3.\n"; exit(1); } // int data_type; check = get_next(dfs, data_type); if (!check) check_error("data_type"); if (debug) print_var("data_type", data_type); if (!(data_type == 1 || data_type == 2)) { errorfs << "Invalid data type. Should be 1 or 2.\n"; exit(1); } //int num_loci; check = get_next(dfs, num_loci); if (!check) check_error("num_loci"); if (debug) print_var("num_loci", num_loci); // int numblanks; check = get_next(dfs, numblanks); if (!check) check_error("numblanks"); if (debug) print_var("numblanks", numblanks); if (numblanks < 0) { errorfs << "Invalid value for numblanks. Should be 0 or greater\n"; exit(1); } // int num_markers; check = get_next(dfs, num_markers); if (!check) check_error("num_markers"); if (debug) print_var("num_markers", num_markers); markers.init(num_markers); // ivector markers(num_markers); for (i = 0; i < num_markers; ++i) { check = get_next(dfs, markers[i]); if (!check) check_error("markers"); if (debug) print_var("markers", markers[i], i); } // int map_type; check = get_next(dfs, map_type); if (!check) check_error("map_type"); if (debug) print_var("map_type", map_type); if (!(map_type == 1 || map_type == 2 || map_type == 3)) { errorfs << "Invalid map type. Should be 1, 2 or 3.\n"; exit(1); } // int map_size; if (map_type == 1) map_size = num_markers - 1; else { if (map_type == 2) map_size = num_markers; else // map_type == 3 map_size = num_loci; } map.init(map_size); // dvector map(num_markers); for (i = 0; i < map_size; ++i) { check = get_next(dfs, map[i]); if (!check) check_error("map"); if (debug) print_var("map", map[i], i); } // int num_total_affecteds; check = get_next(dfs, num_total_affecteds); if (!check) check_error("num_total_affecteds"); if (debug) print_var("num_total_affecteds", num_total_affecteds); // int num_indiv; check = get_next(dfs, num_indiv); if (!check) check_error("num_indiv"); if (debug) print_var("num_indiv", num_indiv); // ivector indiv_list; indiv_list.init(num_indiv); for (i = 0; i < num_indiv; ++i) { check = get_next(dfs, indiv_list[i]); if (!check) check_error("indiv_list"); if (debug) print_var("indiv_list", indiv_list[i], i); } // int hap_or_freq; check = get_next(dfs, hap_or_freq); if (!check) check_error("hap_or_freq"); if (debug) print_var("hap_or_freq", hap_or_freq); if (hap_or_freq != 1 && hap_or_freq != 2 && hap_or_freq != 3 && hap_or_freq != 4) { errorfs << "Invalid control haplotypes / reported frequencies.\n"; errorfs << ". Should be 1, 2, 3, or 4.\n"; exit(1); } num_total_control = 0; num_control = 0; // int num_control = 0; // ivector control_list; // ivector num_alleles; // ivectors allele_list; // dvectors allele_freq; if (hap_or_freq <= 2) { check = get_next(dfs, mkvorder); if (!check) check_error("mkvorder"); if (debug) print_var("mkvorder", mkvorder); if (mkvorder !=1 && mkvorder !=2) { errorfs << "ERROR: mkvorder must be 1 or 2" << endl; exit(1); } check = get_next(dfs, bayes); if (!check) check_error("bayes"); if (debug) print_var("bayes", bayes); if (bayes !=0 && bayes !=1) { errorfs << "ERROR: bayes must be 0 or 1" << endl; exit(1); } check = get_next(dfs, num_total_control); if (!check) check_error("num_total_control"); if (debug) print_var("num_total_control", num_total_control); check = get_next(dfs, num_control); if (!check) check_error("num_control"); if (debug) print_var("num_control", num_control); control_list.init(num_control); for (i = 0; i < num_control; ++i) { check = get_next(dfs, control_list[i]); if (!check) check_error("control_list"); if (debug) print_var("control_list", control_list[i], i); } } else { int num_blank; if (hap_or_freq==3) num_blank=num_loci; else num_blank=num_markers; num_alleles.init(num_blank); allele_list.init(num_blank); allele_freq.init(num_blank); for (i = 0; i < num_blank; ++i) { check = get_next(dfs, num_alleles[i]); if (!check) check_error("num_alleles"); if (debug) print_var("num_alleles", num_alleles[i], i); if (num_alleles[i] < 0) { errorfs << "Error: Number of alleles cannot be negative\n"; exit(1); } allele_list[i].init(num_alleles[i]); for (j = 0; j < num_alleles[i]; j++) { check = get_next(dfs, allele_list[i][j]); if (!check) check_error("allele_list"); if (debug) print_var("allele_list", allele_list[i][j], i * 20 + j); } double freq_sum = 0; allele_freq[i].init(num_alleles[i]); for (j = 0; j < num_alleles[i]; ++j) { check = get_next(dfs, allele_freq[i][j]); if (!check) check_error("allele_freq"); if (debug) print_var("allele_freq", allele_freq[i][j], i * 20 + j); if (allele_freq[i][j] < 0) { errorfs << "Error: Allele frequencies cannot be negative\n"; exit(1); } freq_sum += allele_freq[i][j]; } if (fabs(freq_sum - 1.0) > .001) { errorfs << "Error: Allele frequencies for locus " << i << " do not sum to 1 ["<< freq_sum << "]\n"; exit(1); } } } mut_rates.init(num_loci); // dvector mut_rates(num_loci); for (i = 0; i < num_loci; ++i) { check = get_next(dfs, mut_rates[i]); if (!check) check_error("mut_rates"); if (debug) print_var("mut_rates", mut_rates[i], i); if (mut_rates[i] < 0) { errorfs << "Error: Mutation rates cannot be negative\n"; exit(1); } } p=0; // int est_p = 0; double p = 0; check = get_next(dfs, est_p); if (!check) check_error("est_p"); if (debug) print_var("est_p", est_p); if (!(est_p == 0 || est_p == 1)) { errorfs << "Error: Boolean est_p must be 0 or 1.\n"; exit(1); } check = get_next(dfs, p); if (!check) check_error("p"); if (debug) print_var("p", p); if (p < 0 || p > 1) { errorfs << "Error: Probability p must be between 0 and 1.\n"; exit(1); } // int map_res, max_cand, anc_hap_known; // ivector anc_hap; if (program_code == 1) { check = get_next(dfs, resout); if (!check) check_error("resout"); if (debug) print_var("resout", resout); check = get_next(dfs, ancout); if (!check) check_error("ancout"); if (debug) print_var("ancout", ancout); check = get_next(dfs, maxout); if (!check) check_error("maxout"); if (debug) print_var("maxout", maxout); check = get_next(dfs, oneout); if (!check) check_error("oneout"); if (debug) print_var("oneout", oneout); check = get_next(dfs, E_int); if (!check) check_error("E_int"); if (debug) print_var("E_int", E_int); if (E_int < 0 || num_markers-1 0.0) ) { errorfs << "Error: max_res must be greater than 0.\n"; exit(1); } check = get_next(dfs, map_res); if (!check) check_error("map_res"); if (debug) print_var("map_res", map_res); if (map_res <= 0) { errorfs << "Error: Mapping resolution must be a positive integer.\n"; exit(1); } check = get_next(dfs, max_cand); if (!check) check_error("max_cand"); if (debug) print_var("max_cand", max_cand); if (max_cand <= 0) { errorfs << "Error: Maximum # of ancestral haplotype candidates\n"; errorfs << " must be a positive integer.\n"; exit(1); } check = get_next(dfs, anc_hap_known); if (!check) check_error("anc_hap_known"); if (debug) print_var("anc_hap_known", anc_hap_known); if (anc_hap_known != 0 && anc_hap_known != 1 && anc_hap_known != 2) { errorfs << "Error: Boolean anc_hap_known variable should be 0 or 1.\n"; exit(1); } if (anc_hap_known==1 || anc_hap_known==2) { check = get_next(dfs, num_anc_hap_known); if (!check) check_error("num_anc_hap_known"); if (debug) print_var("num_anc_hap_known", num_anc_hap_known); if ( !(num_anc_hap_known > 0) ) { errorfs << "Error: num_anc_hap_known should be an integer >0.\n"; exit(1); } } if (anc_hap_known==0) { anc_hap.init(1); anc_hap[0].init(num_markers); anc_hap[0].clear(0); } else { anc_hap.init(num_anc_hap_known); for (j=0;j 1) { errorfs << "Error: var_thresh must be between 0 and 1.\n"; exit(1); } check = get_next(dfs, max_cand); if (!check) check_error("max_cand"); if (debug) print_var("max_cand", max_cand); if (max_cand < 0) { errorfs << "Error: Maximum # of ancestral haplotype candidates\n"; errorfs << " must be a nonnegative integer.\n"; errorfs << " (0 should be used for program code=2, unless\n"; errorfs << " an ancestral haplotype of full length is required)\n"; exit(1); } } if (program_code == 3) { check = get_next(dfs, C_loc); if (!check) check_error("C_loc"); if (debug) print_var("C_loc", C_loc); check = get_next(dfs, C_all); if (!check) check_error("C_all"); if (debug) print_var("C_all", C_all); check = get_next(dfs, L_End); if (!check) check_error("L_End"); if (debug) print_var("L_End", L_End); check = get_next(dfs, R_End); if (!check) check_error("R_End"); if (debug) print_var("R_End", R_End); anc_hap.init(1); anc_hap[0].init(num_markers); for (i = 0; i < num_markers; ++i) { check = get_next(dfs, anc_hap[0][i]); if (!check) check_error("anc_hap"); if (debug) print_var("anc_hap", anc_hap[0][i], i); if (anc_hap[i] < 0) { errorfs << "Error: Ancestral haplotypes must be postive integers.\n"; exit(1); } } } peds.init(num_total_affecteds + num_total_control); // pedvector peds(num_indiv + num_control); for (i = 0; i < peds.dim(); ++i) { peds[i].loci.init(num_loci); for (j = 0; j < num_loci; ++j) peds[i].loci[j].init(2); } read_ped_file(pedfile, numblanks, peds); if (debug) { cout << "pedfile\n"; for (i = 0; i < peds.dim(); ++i) { // if (pedfile_type == 2) //cout << peds[i].ped << " "; //cout << peds[i].id << " "; //if (pedfile_type == 2) { //cout << peds[i].fid << " "; //cout << peds[i].mid << " "; //cout << peds[i].sex << " "; //} //if (pedfile_type == 1 || pedfile_type == 2) //cout << peds[i].aff_con << " "; for (j = 0; j < peds[i].loci.dim(); ++j) for (k = 0; k < 2; ++k) cout << peds[i].loci[j][k] << " "; cout << endl; } } return 1; } // readinput //___________________________________________________________________________ int get_next(ifstream& dfs, int& nextvar) { //this function skips to the next integer char token[80]; int done = 0; int good = 0; int currpos = 0; while (!done && !dfs.eof()) { char curr = dfs.get(); if (isdigit(curr) || curr == '-') { token[currpos++] = curr; while (!dfs.eof() && isdigit(curr = dfs.get())) token[currpos++] = curr; if (!dfs.eof()) dfs.unget(); done = 1; good = 1; } else if (curr == '#') { while (!dfs.eof() && curr != '\n' && curr != '\r') curr = dfs.get(); } else if (!isspace(curr)) done = 1; } token[currpos] = '\0'; nextvar = atoi(token); return good; } // get_next(ifstream&, int&) //___________________________________________________________________________ int get_next(ifstream& dfs, simpstrg& nextvar) { //this function skips to the next character string int done = 0; int good = 0; while (!done && !dfs.eof()) { char curr = dfs.get(); if (curr == '#') { while (!dfs.eof() && curr != '\n' && curr != '\r') curr = dfs.get(); } else if (!isspace(curr)) { dfs.unget(); done = 1; good = 1; } } if (done && good) return (dfs >> nextvar ? 1 : 0); else return 0; } // get_next(ifstream&, simpstrg&) //___________________________________________________________________________ int skip_to_loci(ifstream& dfs, int numblanks) { //this function skips over a specified number of columns, used in reading //pedfile char curr; if (numblanks == 0) return 1; curr = dfs.get(); //cout << curr << endl; while ( (isspace(curr) || curr == '\n') && !dfs.eof() ) { curr = dfs.get(); //cout << curr << endl; } if (curr == '\n') return 0; for (int i = 0; i < numblanks; ++i) { while (!isspace(curr)) { curr = dfs.get(); //cout << curr << endl; } if (curr == '\n') return 0; while (isspace(curr)) { curr = dfs.get(); //cout << curr << endl; if (curr == '\n') return 0; } } dfs.unget(); return 1; } // skip_to_loci //___________________________________________________________________________ int get_next(ifstream& dfs, double& nextvar) { //reads in next double char token[80]; int done = 0; int good = 0; int currpos = 0; while (!done && !dfs.eof()) { char curr = dfs.get(); if (isdigit(curr) || toupper(curr) == 'E' || curr == '.' || curr == '-') { token[currpos++] = curr; while (!dfs.eof() && (isdigit(curr = dfs.get()) || curr == '-' || toupper(curr) == 'E' || curr == '.')) token[currpos++] = curr; if (!dfs.eof()) dfs.unget(); done = 1; good = 1; } else if (curr == '#') { while (!dfs.eof() && curr != '\n' && curr != '\r') curr = dfs.get(); } else if (!isspace(curr)) done = 1; } token[currpos] = '\0'; nextvar = atof(token); return good; } // get_next(ifstream&, double&) //___________________________________________________________________________ void print_var(const char* varname, double varval, int index) { if (index < 0) cout << varname << " = [" << varval << "]\n"; else cout << varname << "[" << index << "] = [" << varval << "]\n"; } // print_var(char*, int) //___________________________________________________________________________ void print_var(const char* varname, const simpstrg& varval, int index) { if (index < 0) cout << varname << " = [" << varval << "]\n"; else cout << varname << "[" << index << "] = [" << varval << "]\n"; } // print_var(char*, int) //___________________________________________________________________________ void check_error(const char* checkvar) { errorfs << "Error reading in variable [" << checkvar << "]\n"; exit(1); } //___________________________________________________________________________ int read_ped_file(const char* pedfile, int &numblanks, pedvector& peds) { int i, j, k, check; ifstream pfs(pedfile); if (!pfs) { errorfs << "Unable to read from pedfile [" << pedfile << "]\n" << endl; exit(1); } for (i = 0; i < peds.dim(); ++i) { //if (pedfile_type == 2) //pfs >> peds[i].ped; //pfs >> peds[i].id; //if (pedfile_type == 2) { //pfs >> peds[i].fid; //pfs >> peds[i].mid; //pfs >> peds[i].sex; //} //if (pedfile_type == 1 || pedfile_type == 2) pfs >> peds[i].aff_con; // //check = skip_to_loci(pfs, numblanks); //if (check==0) { //errorfs << "Error in skipping columns in pedfile\n"; //exit(1); } //for (j = 0; j < peds[i].loci.dim(); ++j) //for (k = 0; k < 2; ++k) //pfs >> peds[i].loci[j][k]; check = skip_to_loci(pfs, numblanks); if (check==0) { errorfs << "Error in skipping columns in pedfile\n"; exit(1); } for (j = 0; j < peds[i].loci.dim(); ++j) for (k = 0; k < 2; ++k) { check = get_next(pfs, peds[i].loci[j][k]); if (check==0) { errorfs << "Error in reading allele " << k << " (+1) at locus " << j << " (+1) for individual " << i << "(+1)" << endl; exit(1); } } } return 1; }