#include "peddata.h"


PedData::PedData(){
                   
  MAXTOP = 20;
  snp_size = 0;
  sample_size = 0;
  chrom = 0;
  outfp = stdout;
  sigfp = stdout;
  
  // default threshold
  md_thresh = 0.4;
  r2_thresh = 1.0;
}


int PedData::run(ParamSet &ps){
    
  fprintf(outfp, "****** Results of the ATRIUM Tests ******\n\n");

  fprintf(stderr, "Loading phenotype data ... \n");
  if(read_pheno(ps.phenofile)<0)
    return -1;

  fprintf(stderr, "Loading IBD coefficients ... \n");
  if(load_ibdcoeff(ps.ibdfile)<0)
    return -1;

  fprintf(stderr, "Loading database ... \n");
  if(load_db(ps.dbfile)<0)
    return -1;

  fprintf(stderr, "Loading genotype data ... \n");
  if(read_geno(ps.genofile)<0)
    return -1;

  this->md_thresh = ps.md_thresh;
  this->r2_thresh = ps.r2_thresh;

  fprintf(stderr, "Loading parameter ... \n");
  if(load_parameter(ps.parameterfile)<0)
    return -1;

  fprintf(outfp, "The prevalence value used in the ATRIUM calculations is %.6f.\n\n", prevalence);

  signal_scan();

  return 1;
}


void PedData::signal_scan(){
     
  fprintf(stderr, "Starting ATRIUM ... \n");

  assign();
  vector<punit> marker_list;

  for(int iter=0;iter<marker_info.size();iter++){
    if(marker_info[iter].anchor_num==1 || marker_info[iter].onchip==1 || marker_info[iter].Md<md_thresh || marker_info[iter].mr2>r2_thresh)
	  continue;
	  
    printf("markers %d\n", iter+1);

    fprintf(outfp, "\n*****************************************************\n\n");
    fprintf(outfp, "Target SNP: ");
    fprintf(outfp, "%s (%d on chr %d)\n", marker_info[iter].rsnumber.c_str(),marker_info[iter].position,chrom);
    fprintf(outfp, "Md = %.4f  Maximum r^2 = %.4f", marker_info[iter].Md,marker_info[iter].mr2);
    fprintf(outfp, "\n\n");
    fprintf(outfp, "*****************************************************\n\n");
    
//    fprintf(outfp, "%d\t%s\t%d\t%d\t", iter+1,marker_info[iter].rsnumber.c_str(),marker_info[iter].onchip,marker_info[iter].position);

    nmark = marker_info[iter].anchor_num;
    // printout tag SNPs
    fprintf(outfp, "Tag SNPs\n");
    fprintf(outfp, "   rsnumber\t     position\n");

    vector<int> seq;  // order in data
    vector<int> tseq;  // order in marker_info
    map<int, int> seq_map;  // key:order in data  value: order in tag
    // haplotype coding conversion
    for(int kk=0;kk<nmark;kk++){
      int ref_order = index_map[marker_info[iter].anchor_list[kk]];
      tseq.push_back(ref_order);
      seq.push_back(marker_info[ref_order].order);
      seq_map[marker_info[ref_order].order] = kk;
    }

    sort(seq.begin(),seq.end());
    sort(tseq.begin(),tseq.end());

    vector<char> a0v;
    vector<char> a1v;
  
    for(int k=0;k<tseq.size();k++){
      int di = tseq[k];
      fprintf(outfp, "   %s\t%d     \n",marker_info[di].rsnumber.c_str(), marker_info[di].position);
    
      a0v.push_back(marker_info[di].allele0);
      a1v.push_back(marker_info[di].allele1);
    }
    fprintf(outfp, "\n");

    int tot = 1<<seq.size();
  
    map<int, int> trans_map;  // key: haplotype  value: reordered haplotype
    for(int kk=0;kk<tot;kk++){
      int code = 0;
      int cc = kk;
      for(int m=0;m<seq.size();m++){
        int bit = cc%2;
        if(bit==1){
	      int bitpos = seq_map[seq[m]];
	      code += 1<<(bitpos);
        }
        cc = cc>>1;
      }
      trans_map[code] = kk;
    }

    assign_gcode(iter); 
    weight_table.clear();
    weight_table = marker_info[iter].weight_table;
    hap_list.clear();

//    fprintf(stderr, "Storing unique genotypes ... \n");
    unique_gcode();
    assign_info();

//    fprintf(stderr, "Get EW estimator ... \n");
    EW_a_estimator();

//    fprintf(stderr, "Get CDW estimator ... \n");
    CDW_a_estimator();

    int flag = 1;
    map<int, double>::iterator hiter = hap_freq_table.begin();
    while(hiter!=hap_freq_table.end()){
      if(hiter->second<0.00005){
//        printf("The number of the distinct haplotypes is less\n");
        flag = 0;
        break;
      }
      hiter++;
    }

    if(flag==0){
      hiter = hap_freq_table.begin();
      while(hiter!=hap_freq_table.end()){
        if(hiter->second>=0.00005)
          hap_list[hiter->first] = 0;
        hiter++;
      }

      vector<char *> hpcontent;
      hiter = hap_list.begin();
      while(hiter!=hap_list.end()){
        char *pl = new char[nmark+1];
	    memset(pl,0,nmark+1);
        int h = hiter->first;
        for(int j=0;j<nmark;j++){
          char bit = h%2;
          pl[j] = bit;
          h = h>>1;
        }
	    hpcontent.push_back(pl);
        hiter++;
      }

      for(int i=0;i<geno_list.size();i++)
        delete geno_list[i];
      geno_list.clear();

//      fprintf(stderr, "Storing unique genotypes ... \n");
      unique_gcode();
      assign_info();

//      fprintf(stderr, "Get EW estimator ... \n");
      EW_a_estimator();

//      fprintf(stderr, "Get CDW estimator ... \n");
      CDW_a_estimator();

//      fprintf(stderr, "Storing unique missing patten ... \n");
      unique_mis_patten(hpcontent);

//      fprintf(stderr, "Get ATRIUM_a estimator ... \n");
      ATRIUM_a_estimator(trans_map,a0v,a1v,iter);

//      fprintf(stderr, "Storing unique trio genotypes ... \n");
      unique_trio_gcode();

//      fprintf(stderr, "Storing unique trio missing patten ... \n");
      unique_mis_trio_patten(hpcontent);

//      fprintf(stderr, "Get ATRIUM_b estimator ... \n");
      ATRIUM_b_estimator(trans_map,a0v,a1v,iter);

      for(int i=0;i<hpcontent.size();i++)
	    delete[] hpcontent[i];
	  hpcontent.clear();
    }
    else{
	  vector<char *> hpcontent;
//      fprintf(stderr, "Storing unique missing patten ... \n");
      unique_mis_patten(hpcontent);

//      fprintf(stderr, "Get ATRIUM_a estimator ... \n");
      ATRIUM_a_estimator(trans_map,a0v,a1v,iter);

//      fprintf(stderr, "Storing unique trio genotypes ... \n");
      unique_trio_gcode();

//      fprintf(stderr, "Storing unique trio missing patten ... \n");
      unique_mis_trio_patten(hpcontent);

//      fprintf(stderr, "Get ATRIUM_b estimator ... \n");
      ATRIUM_b_estimator(trans_map,a0v,a1v,iter);
    }
    
    fprintf(outfp, "\n");

//    fprintf(stderr, "Clean heap ... \n\n");
    mem_clean();

    punit su;
    su.place = iter;
    su.p = p;
    marker_list.push_back(su);
  }

  int top = MAXTOP;
  if(MAXTOP >= marker_list.size())
    top = marker_list.size();

  fprintf(sigfp, "Top %d results for the ATRIUM test (list of the untyped SNPs having the %d smallest p-values among all untyped SNPs that were tested) \n\n", top,top);
  fprintf(sigfp, "**************************************************\n\n");

//  printf("size of p is %d\n", marker_list.size());
  std::sort(marker_list.begin(),marker_list.end(),inc_sort);

  for(int j=0;j<top;j++){
    int iter = marker_list[j].place;
    fprintf(sigfp, "SNP %s ", marker_info[iter].rsnumber.c_str());
    fprintf(sigfp, "has a p-value of %g\n", marker_list[j].p);
  }
}


int PedData::read_pheno(char *pheno_file){
    
  int fam_id, ind_id, father_id, mother_id, sex, phenotype;
  int fam_old = -1;
  int n_aff=0, n_unaff=0, n_unknown=0;

  ifstream phenodata(pheno_file);
  istringstream ins;
  string line;
  int lc=0;  // line number in the file

  vector<int *> fam_member;
  while(getline(phenodata,line)){
    lc++;
    ins.clear();
    ins.str(line);

    ins>>ws;
    if(ins.eof())
      continue;

    if(ins >> fam_id >> ind_id >> father_id >> mother_id >> sex >> phenotype){
      if(phenotype < 0 || phenotype > 2)
        fprintf(stderr, "individual %d from family %d does not have a phenotype value of 0, 1, or 2 ...skipped\n\n", ind_id,fam_id);
      else
        individual_list[ind_id] = -1;
        
      if(fam_id!=fam_old){
        if(fam_old!=-1){
          fam_list.push_back(new Family(fam_member,fam_old));
          fam_member.clear();
        }

        if(fam_map.find(fam_id)!=fam_map.end()){
          fprintf(stderr, "family %d is not well ordered\n\n", fam_id);
          return 0;
        }
        else{
          fam_map[fam_id] = fam_list.size();
          fam_old=fam_id;
        }
      }

      int *a = new int[5];
      memset(a,0,5*sizeof(int));
      a[0] = ind_id;
      a[1] = father_id;
      a[2] = mother_id;
      a[3] = sex;
      a[4] = phenotype;
      fam_member.push_back(a);  
    }
  }

  fam_list.push_back(new Family(fam_member,fam_old));
  fam_member.clear();
  
  for(int i=0;i<fam_list.size();i++){
    n_aff+=fam_list[i]->Affected;
    n_unaff+=fam_list[i]->UnAffected;
    n_unknown+=fam_list[i]->Unknown;
  }
  sample_size=n_aff+n_unaff+n_unknown;

  fprintf(outfp, "There are %d individuals from %d independent families in the phenotype data file.\n", sample_size,fam_list.size());
  fprintf(outfp, "Of these, %d are affected, %d are unaffected and %d are of unknown phenotype.\n\n", n_aff,n_unaff,n_unknown);

  return 1;
}


int PedData::load_db(char *db_file){
    
  ifstream lib(db_file);
  istringstream ins;
  string line;
  int lc=0;  // line number in the file

  //variable
  string rsnumber;
  int onchip;
  int pos;
  double temp;  // max_md
  double md;
  double mr2;
  char allele0;
  char allele1;
  int model_size;  // # of tag SNPs
  
  string ref_s;
  string weight_s;
  
  while(getline(lib,line)){
    lc++;
    const char *line_str = line.c_str();
    if(strstr(line_str,"MONOMORPHIC")!=0 || strstr(line_str,"NOINFO")!=0)
      continue;
 
    ins.clear();
    ins.str(line);

    ins>>ws;
    // empty line, skip
    if(ins.eof())
      continue;

    if(ins >> rsnumber >> onchip >> pos >> allele0 >> allele1 >> temp >> mr2 >> md >> model_size >> ref_s >> weight_s){
      MarkerInfo mi;    
      mi.rsnumber = rsnumber;
      mi.onchip = onchip;
      mi.position = pos;
      mi.order = -1;
      
      if(allele0>=97)  // convert to capital letter
	    allele0-=32;
      if(allele1>=97)
	    allele1-=32;
      
      // direct assignment, no assumption about allele0 < allele1
      mi.allele0 = allele0;
      mi.allele1 = allele1;
      
      vector<string> anchor_list = tokenize(ref_s,string(":"));
      mi.anchor_list = anchor_list;
      
      // maximum r^2 
      mi.mr2 = mr2;
      mi.Md = md;
      mi.anchor_num = model_size;

      map<int, double> weight_map;  // key: haplotype, value: weight
      map<int, int> hfreq_map;

      // weights
      vector<string> w_list = tokenize(weight_s,string("_"));
      for(int k=0;k<w_list.size();k++){
	    vector<string> wsi = tokenize(w_list[k],string(":"));
	    int code = atoi(wsi[0].c_str());  // haplotype
        hfreq_map[code] = atoi(wsi[1].c_str());
	    weight_map[code] = atof(wsi[3].c_str());  // weight
      }
      
      mi.weight_table = weight_map;
      mi.phap_freq_table = hfreq_map;

      index_map[rsnumber] = marker_info.size();
    
      marker_info.push_back(mi);
    }
    else{
      fprintf(stderr, "Error: Unexpected format in LD Database %s line %d\n", db_file,lc);
      return -1;
    }
  }

  return 1;
}


int PedData::load_ibdcoeff(char *ibdco_file){

  int ind_1, ind_2;
  double coef8, coef7, coef6, coef5, coef4, coef3, coef2, coef1, coef0;
  int loc;

  map<int, int> ind_fam;
  for(int i=0;i<fam_list.size();i++)
    for(int j=0;j<fam_list[i]->N;j++)
      ind_fam[fam_list[i]->fam_member[j][0]] = i;

  ifstream ibdcodata(ibdco_file);
  istringstream ins;
  string line;
  int lc=0;  // line number in the file

  while(getline(ibdcodata,line)){
    lc++;
    ins.clear();
    ins.str(line);

    ins>>ws;
    if(ins.eof())
      continue;

    if(ins >> ind_1 >> ind_2 >> coef8 >> coef7 >> coef6>> coef5 >> coef4 >> coef3 >> coef2 >> coef1 >> coef0){
      if(ind_1==ind_2){
        if(ind_fam.find(ind_1)==ind_fam.end())
          fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file);
        else{
          loc = ind_fam[ind_1];
          if(coef8>0 || coef7>0 || coef6>0 || coef5>0 || coef4>0 || coef3>0 || coef1>0 || coef0>0 || fabs(coef2-1.0)>1e-8){
            fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file);
            if(individual_list.find(ind_1)!=individual_list.end())
              individual_list.erase(ind_1);
          }
          else
            fam_list[loc]->kinship_coeff[pair<int, int>(ind_1,ind_2)] = 0;
        }
      }
      else{
        if(ind_fam.find(ind_1)==ind_fam.end() || ind_fam.find(ind_2)==ind_fam.end())
          fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file);
        else{
          loc = ind_fam[ind_1];
          if(ind_fam[ind_2]!=loc)
            fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file);
          else{
            if(coef8>0 || coef7>0 || coef6>0 || coef5>0 || coef4>0 || coef3>0)
              fprintf(stderr, "Skip line %d in ibd coefficients file \"%s\"\n", lc,ibdco_file);
            else{
              fam_list[loc]->IBD2_coeff[pair<int, int>(ind_1,ind_2)] = coef2;
              fam_list[loc]->IBD1_coeff[pair<int, int>(ind_1,ind_2)] = coef1;
              fam_list[loc]->IBD0_coeff[pair<int, int>(ind_1,ind_2)] = coef0;
              fam_list[loc]->kinship_coeff[pair<int, int>(ind_1,ind_2)] = coef2/2.0+coef1/4.0;
              fam_list[loc]->IBD2_coeff[pair<int, int>(ind_2,ind_1)] = coef2;
              fam_list[loc]->IBD1_coeff[pair<int, int>(ind_2,ind_1)] = coef1;
              fam_list[loc]->IBD0_coeff[pair<int, int>(ind_2,ind_1)] = coef0;
              fam_list[loc]->kinship_coeff[pair<int, int>(ind_2,ind_1)] = coef2/2.0+coef1/4.0;
            }
          }
        }
      }
    }
  }

  ind_fam.clear();

  return 1;
}


int PedData::read_geno(char *geno_file){
    
  string marker, alias;
  int chr, pos;
  char orientation, alleleA, alleleB;
  string geno;
  vector<string> genotype_line;

  int ind_num, count=0;
  vector<int> record;

  ifstream genodata(geno_file);
  istringstream ins;
  string line;
  int lc=0;  // line number in the file

  if(getline(genodata,line)){  
    // get the first line
    ins.clear();
    ins.str(line);

    ins>>ws;
    if(ins >> alias >> alias >> alias >> alias >> alias >> alias){  // rsnumber; chromosome; position; orientation; alleleA; alleleB
      while(ins >> ind_num){
        if(individual_list.find(ind_num)!=individual_list.end()){
          individual_list[ind_num] = count;
          count++;
          record.push_back(1);
        }
        else  // individual is not in phenotype file
          record.push_back(0);
      }

      people_size = count;
      fprintf(outfp, "There are %d individuals listed in the genotype file, %d of whom are also in the phenotype file.\n\n", record.size(),people_size);
    }
  }

  while(getline(genodata,line)){
    lc++;
    ins.clear();
    ins.str(line);

    ins>>ws;
    if(ins.eof())
      continue;

    if(ins >> marker >> chr >> pos >> orientation >> alleleA >> alleleB){
      if(chrom == 0)
        chrom = chr;

      if(index_map.find(marker)!=index_map.end()){
        int index = index_map[marker];
        marker_info[index].onchip = 1;
        marker_info[index].order = snp_size;
        if(orientation == '+')
          marker_info[index].strand_switch = 0;
        else if(orientation == '-')
          marker_info[index].strand_switch = 1;

        marker_info[index].alleleA = alleleA;
        marker_info[index].alleleB = alleleB;

	    // mapping map order and target_list order
	    order2index[snp_size] = index;

        snp_size++;

        count = 0;
        while(ins >> geno){
          if(record[count]==1)
            genotype_line.push_back(geno);
          count++;
        }

        if(count!=record.size())
          printf("error in genofile\n");

        if(people_size != genotype_line.size()){
          fprintf(stderr, "Error: Unequal number of persons recorded in genotyo data file\n");
          return -1;
        }

        string *cl = new string[genotype_line.size()];
        for(int i=0;i<genotype_line.size();i++)
          cl[i] = genotype_line[i];
        genotype_line.clear();
        raw_content.push_back(cl);
      }
    }
    else
      continue;
  }

  record.clear();

  return convert_coding();
}


int PedData::convert_coding(){
    
  for(int i=0;i<snp_size;i++){
    if(order2index.find(i)!=order2index.end()){
      int index = order2index[i];
      char ta0 = marker_info[index].allele0;
      char ta1 = marker_info[index].allele1;
      char da0 = marker_info[index].alleleA;
      char da1 = marker_info[index].alleleB;
      if(da0>da1){
        char temp = da0;
        da0 = da1;
        da1 = temp;
      }

      int sw_flag = marker_info[index].strand_switch;
      
      map<string,int> geno_code;
      geno_code.clear();
      geno_code["NN"] = -1;
      geno_code["ND"] = -1;

      //check data/target coding system
      // da0 < da1 always true 
      if((ta0 == da0 && ta1 == da1) || (ta0 == da1 && ta1 == da0)){
	    // A/T SNP;
        if(ta0 == 'A' && ta1 == 'T' && da0 == 'A' && da1 == 'T'){
          if(sw_flag == 1){
            geno_code["AA"] = 2;
            geno_code["TA"] = geno_code["AT"] = 1;
            geno_code["TT"] = 0;
          }
          else{
            geno_code["AA"] = 0;
	        geno_code["TA"] = geno_code["AT"] = 1;
	        geno_code["TT"] = 2;
	      }
	    }
	    else if(ta0 == 'T' && ta1 == 'A' && da0 == 'A' && da1 == 'T'){
	      if(sw_flag == 1){
            geno_code["AA"] = 0;
	        geno_code["TA"] = geno_code["AT"] = 1;
	        geno_code["TT"] = 2;
	      }
          else{
	        geno_code["AA"] = 2;
	        geno_code["TA"] = geno_code["AT"] = 1;
	        geno_code["TT"] = 0;
	      }
	    }

        // C/G SNP
	    else if(ta0 == 'C' && ta1 == 'G' && da0 == 'C' && da1 == 'G'){
	      if(sw_flag == 1){
	        geno_code["CC"] = 2;
	        geno_code["CG"] = geno_code["GC"] = 1;
	        geno_code["GG"] = 0;
	      }
          else{
	        geno_code["CC"] = 0;
	        geno_code["CG"] = geno_code["GC"] = 1;
	        geno_code["GG"] = 2;
	      }
	    }
	    else if(ta0 == 'G' && ta1 == 'C' && da0 == 'C' && da1 == 'G'){
	      if(sw_flag == 1){
	        geno_code["CC"] = 0;
	        geno_code["CG"] = geno_code["GC"] = 1;
	        geno_code["GG"] = 2;
	      }
          else{
	        geno_code["CC"] = 2;
	        geno_code["CG"] = geno_code["GC"] = 1;
	        geno_code["GG"] = 0;
	      }
	    }

        // not A/T or C/G SNPs
	    else{
	      char AA[3];
	      char AB[3];
	      char BA[3];
	      char BB[3];
	  
	      AA[2]=AB[2]=BA[2]=BB[2]=0;
	  
	      AA[0]=da0;
	      AA[1]=da0;
	  
	      AB[0]=da0;
	      AB[1]=da1;
	  
	      BA[0]=da1;
	      BA[1]=da0;
	  
	      BB[0]=da1;
	      BB[1]=da1;
	  
	      geno_code[string(AB)]=geno_code[string(BA)]=1;
	  
	      if(ta0 == da0){
	        geno_code[string(AA)] = 0;
	        geno_code[string(BB)] = 2;
	      }
	      else if(ta0 == da1){
	        geno_code[string(AA)] = 2;
	        geno_code[string(BB)] = 0;
	      }
	    }
      }
      
      // not equal
      else{
	    if(da0 == 'G' && da1 == 'T'){
	      if(ta0 == 'A' && ta1 == 'C'){
	        geno_code["TT"] = 0;
	        geno_code["TG"] = geno_code["GT"] = 1;
	        geno_code["GG"] = 2;
	      }
	      else if(ta0 =='C' && ta1 == 'A'){
	        geno_code["TT"] = 2;
	        geno_code["TG"] = geno_code["GT"] = 1;
	        geno_code["GG"] = 0;
	      }
	    }

    	else if(da0 == 'C' && da1 == 'T'){
	      if(ta0 == 'A' && ta1 == 'G'){
	        geno_code["TT"] = 0;
	        geno_code["CT"] = geno_code["TC"] = 1;
	        geno_code["CC"] = 2;
	      }
	      else if(ta0 == 'G' && ta1 == 'A'){
	        geno_code["TT"] = 2;
	        geno_code["CT"] = geno_code["TC"] = 1;
	        geno_code["CC"] = 0;
	      }
	    }
	
	    else if(da0 == 'A' && da1 == 'G'){
	      if(ta0 == 'C' && ta1 == 'T'){
	        geno_code["GG"] = 0;
	        geno_code["AG"] = geno_code["GA"] = 1;
	        geno_code["AA"] = 2;
	      }
	      else if(ta0 == 'T' && ta1 == 'C'){
	        geno_code["GG"] = 2;
	        geno_code["AG"]  = geno_code["GA"] = 1;
	        geno_code["AA"] = 0;
	      }
	    }
	
	    else if(da0 == 'A' && da1 == 'C'){
	      if(ta0 == 'G' && ta1 == 'T'){
	        geno_code["CC"] = 0;
	        geno_code["AC"] = geno_code["CA"] = 1;
	        geno_code["AA"] = 2;
	      }
	      else if(ta0 == 'T' && ta1 == 'G'){
	        geno_code["CC"] = 2;
	        geno_code["AC"] = geno_code["CA"] = 1;
	        geno_code["AA"] = 0;
	      }
	    }
      }

      int *data = new int[people_size];
      for(int j=0;j<people_size;j++)
        data[j] = geno_code[raw_content[i][j]];
      delete[] raw_content[i];
	  content.push_back(data);
    }
  }
  
  raw_content.clear();

  return 1;
}


int PedData::load_parameter(char *parameter_file){
    
  double kp;

  ifstream parameterdata(parameter_file);
  istringstream ins;
  string line;

  getline(parameterdata,line);
  ins.clear();
  ins.str(line);

  ins>>ws;
  if(ins >> kp)
    prevalence = kp;

  return 1;
}


void PedData::assign(){

  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->set();
}


void PedData::assign_gcode(int iter){
     
  vector<int> index;

  for(int i=0;i<nmark;i++){
    int ref_list_pos = index_map[marker_info[iter].anchor_list[i]];
    index.push_back(marker_info[ref_list_pos].order);
  }

  int *gd = new int[nmark];
  int count=0;
  int count_aff=0;
  int count_unaff=0;
  int count_unknown=0;
  
  for(int i=0;i<fam_list.size();i++){
    fam_list[i]->set_gcode(nmark);
    for(int j=0;j<fam_list[i]->N;j++){
      int ind_num = fam_list[i]->fam_member[j][0];
      if(individual_list.find(ind_num)!=individual_list.end()){
        int col = individual_list[ind_num];
        if(col>=0){
          for(int k=0;k<nmark;k++)
            gd[k] = content[index[k]][col];
          fam_list[i]->set_gcode(gd,j);

          for(int k=0;k<nmark;k++){
            if(gd[k]!=-1){
              count++;
              if(fam_list[i]->fam_member[j][4]==2)
                count_aff++;
              else if(fam_list[i]->fam_member[j][4]==1)
                count_unaff++;
              else if(fam_list[i]->fam_member[j][4]==0)
                count_unknown++;

              break;
            }
          }
        }
      }
    }
    fam_list[i]->fill_typed();
  }
  delete[] gd;
  index.clear();

  fprintf(outfp, "There are %d individuals with at least one tag SNP genotyped.\n", count);
  fprintf(outfp, "Of these, %d are affected, %d are unaffected and %d are of unknown phenotype. \n\n", count_aff,count_unaff,count_unknown);
}


void PedData::assign_info(){
     
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->fill_info_gcode(geno_list);
}


void PedData::unique_gcode(){

  complete_geno_list.clear();
  int count=0;
  int *gd = new int[nmark];
  for(int k=0;k<nmark;k++)
    gd[k] = -1;  // genotypes are missing at all markers
  Genotype *geno = new Genotype(gd,hap_list,nmark);
  vector<int*> *complete_geno_code = geno->get_geno_code();  // complete-data genotypes
  count = complete_geno_code->size();
//  printf("There are %d complete genotypes\n", count);

  for(int i=0;i<count;i++){
    Genotype *complete_geno = new Genotype((*complete_geno_code)[i],hap_list,nmark);
    const vector<pair<int, int> > hv = complete_geno->get_hap_vec();
    if(hv.size()>0){
      complete_geno->compute_gname();
      complete_geno_list.push_back(complete_geno);
    }
    else
      delete complete_geno;
  }
  delete geno;
//  printf("There are %d complete genotypes stored\n", complete_geno_list.size());

  geno_list.clear();
  map<int, int> gmap;

  for(int i=0;i<fam_list.size();i++)
    for(int j=0;j<fam_list[i]->N;j++){
      int code=0;
      for(int k=0;k<nmark;k++){
        int gc = fam_list[i]->gcode[j][k];
        gd[k] = gc;
        code = 10*code+gc;
      }

      if(gmap.find(code)==gmap.end()){
        gmap[code] = geno_list.size();
        geno_list.push_back(new Genotype(gd,hap_list,nmark));
      }

      fam_list[i]->set_cat(gmap[code],j);
      geno_list[gmap[code]]->increase_num();
    }
  delete[] gd;
/*
  for(int i=0;i<complete_geno_list.size();i++)
    complete_geno_list[i]->show_gstat();
  printf("\n");

  for(int i=0;i<geno_list.size();i++)
    geno_list[i]->show_gstat();
*/
}


void PedData::EW_a_estimator(){
     
  HFEstimator est(fam_list,geno_list,hap_list,nmark);

  int count = 1;
  int flag;
  while(1){
    map<int, double> of = est.get_hap_freq();
    est.EM_Iteration(0);
    map<int, double> nf = est.get_hap_freq();

    double diff = 0;
    flag = 1;
    for(int i=0;i<of.size();i++){
      diff += (nf[i]-of[i])*(nf[i]-of[i]);
      if(nf[i]<0){
        flag = 0;
        break;
      }
    }

    if(diff<1e-8 || count>100 || flag==0)
      break;

    count++;
  }

  if(flag==1)
    hap_freq_table = est.get_hap_freq();

//  printf("It takes %d iterations.\n", count);
//  est.show_hap_freq();
}


void PedData::CDW_a_estimator(){
     
  // compute complete-data weight
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->compute_weight();

  // sum up
  for(int i=0;i<fam_list.size();i++)
    for(int j=0;j<fam_list[i]->N;j++){
      int cat=fam_list[i]->cat[j];
      geno_list[cat]->increase_count(fam_list[i]->weight[j]);
    }

  HFEstimator est(fam_list,geno_list,hap_list,nmark);
  est.reset_freq_table(hap_freq_table);

  int count = 1;
  int flag;
  while(1){
    map<int, double> of = est.get_hap_freq();
    est.EM_Iteration(1);
    map<int, double> nf = est.get_hap_freq();

    double diff = 0;
    flag = 1;
    for(int i=0;i<of.size();i++){
      diff += (nf[i]-of[i])*(nf[i]-of[i]);
      if(nf[i]<0){
        flag = 0;
        break;
      }
    }

    if(diff<1e-6 || count>20 || flag==0)
      break;

    count++;
  }

  if(flag==1)
    hap_freq_table = est.get_hap_freq();

//  printf("It takes %d iterations.\n", count);
//  est.show_hap_freq();
}


void PedData::unique_mis_patten(const vector<char *> & hpcontent){
     
  missing_list.clear();
  map<int, int> mis_map;

  for(int i=0;i<fam_list.size();i++)
    for(int j=0;j<fam_list[i]->N;j++){
      int code=0;
      vector<int> missing_site;
      for(int k=0;k<nmark;k++){
        int gc = fam_list[i]->gcode[j][k];
        if(gc==-1){
          code = 10*code+gc;
          missing_site.push_back(k);
        }
        else
          code*=10;
      }

      if(mis_map.find(code)==mis_map.end()){
        mis_map[code] = missing_list.size();
        missing_list.push_back(new Missing(missing_site,hap_list,hpcontent,nmark));
      }

      fam_list[i]->set_mis_cat(mis_map[code],j);
      missing_list[mis_map[code]]->increase_num();
      missing_site.clear();
    }
/*
  for(int i=0;i<missing_list.size();i++)
    printf("%d %d, ", missing_list[i]->dim,missing_list[i]->mapping_function.size());
  printf("\n");
*/
}


void PedData::ATRIUM_a_estimator(map<int, int> &trans_map, vector<char> &a0v, vector<char> &a1v, int loc){
     
  for(int i=0;i<complete_geno_list.size();i++)
    complete_geno_list[i]->compute_gname();

  // IQLa
  IQLAEstimator est(fam_list,complete_geno_list,geno_list,missing_list,hap_list,nmark);
  est.set_output(outfp);
  est.reset_freq_table(hap_freq_table);

  int count = 1;
  int flag;
  est.allocate();
  while(1){
    map<int, double> of = est.get_hap_freq();
    est.NR_Iteration_new();
    map<int, double> nf = est.get_hap_freq();

    double diff = 0;
    flag = 1;
    for(int i=0;i<of.size();i++){
      diff += (nf[i]-of[i])*(nf[i]-of[i]);
      if(nf[i]<0){
        flag = 0;
        break;
      }
    }

    if(diff<1e-5 || count>7 || flag==0)
      break;

    count++;
  }

  if(flag==1)
    hap_freq_table = est.get_hap_freq();

//  printf("It takes %d iterations.\n", count);
//  est.show_hap_freq(trans_map,a0v,a1v);

  // compute test weight
  double mean = -prevalence/(1-prevalence);
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->compute_test_weight(mean);

  hap_weight.clear();
  double sum_1 = 0;
  double sum_2 = 0;
  map<int, double>::iterator iter = hap_freq_table.begin();
  while(iter!=hap_freq_table.end()){
    int h = iter->first;
    if(weight_table.find(h)!=weight_table.end()){
      sum_1 += (iter->second*weight_table[h]);
      sum_2 += (iter->second*(1-weight_table[h]));
    }
    iter++;
  }

  iter = hap_freq_table.begin();
  while(iter!=hap_freq_table.end()){
    int h = iter->first;
    if(weight_table.find(h)!=weight_table.end())
      hap_weight[h] = weight_table[h]/sum_1-(1-weight_table[h])/sum_2;
    else
      hap_weight[h] = 0;

    iter++;
  }
  
  // association test
  // IQLS_a test
  est.Score_test_new(hap_weight);
//  est.show_test();
  est.deallocate();

  p = est.get_p_value();
  stat = est.get_stat();
}


void PedData::unique_trio_gcode(){

  geno_trio_list.clear();
  vector<trio> triolist;
  oneparent = 0;

  for(int i=0;i<fam_list.size();i++){
    for(int j=0;j<fam_list[i]->N;j++){
      trio m;
      int id_1, id_2, ind_1, ind_2;
      int loc;
      int founder=0;
      int cat_1, cat_2;
      int typed_1, typed_2;
      int swi=0;

      m.cat_o = fam_list[i]->cat[j];
      m.typed_o = fam_list[i]->typed[j];
      
      ind_1 = fam_list[i]->fam_member[j][1];
      ind_2 = fam_list[i]->fam_member[j][2];
      if(ind_1==0 && ind_2==0){  // j is a founder
        m.cat_par1 = -1;
        m.typed_par1 = 0;
        m.cat_par2 = -1;
        m.typed_par2 = 0;
        founder = 1;
      }
      else{
        id_1 = fam_list[i]->member_map[ind_1];
        cat_1 = fam_list[i]->cat[id_1];
        typed_1 = fam_list[i]->typed[id_1];
        id_2 = fam_list[i]->member_map[ind_2];
        cat_2 = fam_list[i]->cat[id_2];
        typed_2 = fam_list[i]->typed[id_2];
        
        if(typed_1==typed_2){  // 00 or 11
          if(cat_1<=cat_2){
            m.cat_par1 = cat_1;
            m.typed_par1 = typed_1;
            m.cat_par2 = cat_2;
            m.typed_par2 = typed_2;               
          }
          else{
            m.cat_par1 = cat_2;
            m.typed_par1 = typed_2;
            m.cat_par2 = cat_1;
            m.typed_par2 = typed_1;
            swi = 1;   
          }
        }
        else if(typed_1<typed_2){  // 01
          m.cat_par1 = cat_2;
          m.typed_par1 = typed_2;
          m.cat_par2 = cat_1;
          m.typed_par2 = typed_1;
          swi = 1;
        }
        else{  // 10
          m.cat_par1 = cat_1;
          m.typed_par1 = typed_1;
          m.cat_par2 = cat_2;
          m.typed_par2 = typed_2;     
        }                                 
      }       

      loc = find_loc(triolist,m);
      if(loc==-1){  // not found
        fam_list[i]->set_trio_cat(geno_trio_list.size(),j);
        triolist.push_back(m);
        if(founder==1)
          geno_trio_list.push_back(new Genotypetrio(geno_list[m.cat_o],m.typed_o,0,0,nmark));
        else
          geno_trio_list.push_back(new Genotypetrio(geno_list[m.cat_o],geno_list[m.cat_par1],geno_list[m.cat_par2],m.typed_o,m.typed_par1,m.typed_par2,nmark));

        geno_trio_list[geno_trio_list.size()-1]->increase_num();

        if(oneparent==0){
          if((typed_1==1 && typed_2==0) || (typed_1==0 && typed_2==1))
            oneparent = 1;
        }
      }
      else{
        fam_list[i]->set_trio_cat(loc,j);
        geno_trio_list[loc]->increase_num();
      }
    }
  }

  triolist.clear();
}


void PedData::unique_mis_trio_patten(const vector<char *> & hpcontent){

  missing_trio_list.clear();
  vector<trio> mis_triolist;

  for(int i=0;i<fam_list.size();i++){
    for(int j=0;j<fam_list[i]->N;j++){
      trio m;
      int id_1, id_2, ind_1, ind_2;
      int loc;
      int founder=0;
      int cat_1, cat_2;
      int typed_1, typed_2;
      int swi=0;

      m.cat_o = fam_list[i]->mis_cat[j];
      m.typed_o = fam_list[i]->typed[j];

      ind_1 = fam_list[i]->fam_member[j][1];
      ind_2 = fam_list[i]->fam_member[j][2];
      if(ind_1==0 && ind_2==0){  // j is a founder
        m.cat_par1 = -1;
        m.typed_par1 = 0;
        m.cat_par2 = -1;
        m.typed_par2 = 0;
        founder = 1;
      }
      else{
        id_1 = fam_list[i]->member_map[ind_1];
        cat_1 = fam_list[i]->mis_cat[id_1];
        typed_1 = fam_list[i]->typed[id_1];
        id_2 = fam_list[i]->member_map[ind_2];
        cat_2 = fam_list[i]->mis_cat[id_2];
        typed_2 = fam_list[i]->typed[id_2];
        
        if(typed_1==typed_2){  // 00 or 11
          if(cat_1<=cat_2){
            m.cat_par1 = cat_1;
            m.typed_par1 = typed_1;
            m.cat_par2 = cat_2;
            m.typed_par2 = typed_2;               
          }
          else{
            m.cat_par1 = cat_2;
            m.typed_par1 = typed_2;
            m.cat_par2 = cat_1;
            m.typed_par2 = typed_1;
            swi = 1;   
          }
        }
        else if(typed_1<typed_2){  // 01
          m.cat_par1 = cat_2;
          m.typed_par1 = typed_2;
          m.cat_par2 = cat_1;
          m.typed_par2 = typed_1;
          swi = 1;
        }
        else{  // 10
          m.cat_par1 = cat_1;
          m.typed_par1 = typed_1;
          m.cat_par2 = cat_2;
          m.typed_par2 = typed_2;     
        }                                 
      }

      loc = find_loc(mis_triolist,m);
      if(loc==-1){  // not found
        fam_list[i]->set_mis_trio_cat(missing_trio_list.size(),j);
        mis_triolist.push_back(m);
        if(founder==1)
          missing_trio_list.push_back(new Missingtrio(missing_list[m.cat_o],m.typed_o,0,0,hap_list,hpcontent,nmark));
        else
          missing_trio_list.push_back(new Missingtrio(missing_list[m.cat_o],missing_list[m.cat_par1],missing_list[m.cat_par2],m.typed_o,m.typed_par1,m.typed_par2,hap_list,hpcontent,nmark));

        missing_trio_list[missing_trio_list.size()-1]->increase_num();
      }
      else{
        fam_list[i]->set_mis_trio_cat(loc,j);
        missing_trio_list[loc]->increase_num();
      }
      fam_list[i]->set_mis_par_order(swi,j);
    }
  }

  mis_triolist.clear();
}


void PedData::ATRIUM_b_estimator(map<int, int> &trans_map, vector<char> &a0v, vector<char> &a1v, int loc){
     
  IQLBEstimator est(fam_list,geno_trio_list,missing_trio_list,hap_list,nmark,oneparent);
  est.set_output(outfp);
  est.reset_freq_table(hap_freq_table);

  int count = 1;
  int flag;
  est.allocate();
  while(1){
    map<int, double> of = est.get_hap_freq();
    est.NR_Iteration_new();
    map<int, double> nf = est.get_hap_freq();

    double diff = 0;
    flag = 1;
    for(int i=0;i<of.size();i++){
      diff += (nf[i]-of[i])*(nf[i]-of[i]);
      if(nf[i]<0){
        flag = 0;
        break;
      }
    }

    if(diff<1e-5 || count>5 || flag==0)
      break;

    count++;
  }

  if(flag==1)
    hap_freq_table = est.get_hap_freq();

//  printf("It takes %d iterations.\n", count);
  est.show_hap_freq(trans_map,a0v,a1v);

  est.estimate_subgroup();

  // compute test trio weight
  double mean = -prevalence/(1-prevalence);
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->compute_test_trio_weight(mean);

  hap_weight.clear();
  double sum_1 = 0;
  double sum_2 = 0;
  double sum_hap = 0;
  map<int, double>::iterator iter = hap_freq_table.begin();
  while(iter!=hap_freq_table.end()){
    int h = iter->first;
    if(weight_table.find(h)!=weight_table.end()){
      sum_1 += (iter->second*weight_table[h]);
      sum_2 += (iter->second*(1-weight_table[h]));
      sum_hap += iter->second;
    }
    iter++;
  }
  sum_hap = sum_1/sum_hap;

  iter = hap_freq_table.begin();
  while(iter!=hap_freq_table.end()){
    int h = iter->first;
    if(weight_table.find(h)!=weight_table.end())
      hap_weight[h] = weight_table[h]/sum_1-(1-weight_table[h])/sum_2;
    else
      hap_weight[h] = 0;

    iter++;
  }

  est.Score_test_new(hap_weight);
  est.show_test(stat);
  est.deallocate();

  p = est.get_p_value();

  fprintf(outfp, "\nAllele frequency estimates for the untyped SNP\n");
  fprintf(outfp, "           full sample      ");
  if(est.aff>=15)
    fprintf(outfp, "affected       ");
  if(est.unaff>=15)
    fprintf(outfp, "unaffected       ");
  if(est.unknown>=15)
    fprintf(outfp, "unknown");
  fprintf(outfp, "\n");

  double sum_aff = 0;
  double sum_unaff = 0;
  double sum_unknown = 0;
  fprintf(outfp, "allele %c: freq = %.4f   ", marker_info[loc].allele0,sum_hap);
  if(est.aff>=15){
    sum_1 = 0;
    sum_aff = 0;
    iter = (est.aff_hap_freq_table).begin();
    while(iter!=(est.aff_hap_freq_table).end()){
      int h = iter->first;
      if(weight_table.find(h)!=weight_table.end()){
        sum_1 += (iter->second*weight_table[h]);
        sum_aff += iter->second;
      }
      iter++;
    }
    sum_aff = sum_1/sum_aff;
    fprintf(outfp, "freq = %.4f   ", sum_aff);
  }
  if(est.unaff>=15){
    sum_1 = 0;
    sum_unaff = 0;
    iter = (est.unaff_hap_freq_table).begin();
    while(iter!=(est.unaff_hap_freq_table).end()){
      int h = iter->first;
      if(weight_table.find(h)!=weight_table.end()){
        sum_1 += (iter->second*weight_table[h]);
        sum_unaff += iter->second;
      }
      iter++;
    }
    sum_unaff = sum_1/sum_unaff;
    fprintf(outfp, "freq = %.4f   ", sum_unaff);
  }
  if(est.unknown>=15){
    sum_1 = 0;
    sum_unknown = 0;
    iter = (est.unknown_hap_freq_table).begin();
    while(iter!=(est.unknown_hap_freq_table).end()){
      int h = iter->first;
      if(weight_table.find(h)!=weight_table.end()){
        sum_1 += (iter->second*weight_table[h]);
        sum_unknown += iter->second;
      }
      iter++;
    }
    sum_unknown = sum_1/sum_unknown;
    fprintf(outfp, "freq = %.4f   ", sum_unknown);
  }
  fprintf(outfp, "\n");

  fprintf(outfp, "allele %c: freq = %.4f   ", marker_info[loc].allele1,1-sum_hap);
  if(est.aff>=15)
    fprintf(outfp, "freq = %.4f   ", 1-sum_aff);
  if(est.unaff>=15)
    fprintf(outfp, "freq = %.4f   ", 1-sum_unaff);
  if(est.unknown>=15)
    fprintf(outfp, "freq = %.4f   ", 1-sum_unknown);
  fprintf(outfp, "\n");

  if(est.aff>0 && est.aff*(sum_aff<=1-sum_aff ? sum_aff : 1-sum_aff)<5)
    fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in affecteds\n");
  if(est.unaff>=est.unknown){
    if(est.unaff>0 && est.unaff*(sum_unaff<=1-sum_unaff ? sum_unaff : 1-sum_unaff)<5)
      fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in unaffecteds\n");
  }
  else{
    if(est.unknown>0 && est.unknown*(sum_unknown<=1-sum_unknown ? sum_unknown : 1-sum_unknown)<5)
      fprintf(outfp, "The p-value might not be exact because of the small number of minor alleles in unknowns\n");
  }
}


void PedData::print_data(){
     
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->print();
}


void PedData::mem_clean(){
     
  for(int i=0;i<fam_list.size();i++)
    fam_list[i]->delete_gcode();

  for(int i=0;i<complete_geno_list.size();i++)
    delete complete_geno_list[i];
  complete_geno_list.clear();

  for(int i=0;i<geno_list.size();i++)
    delete geno_list[i];
  geno_list.clear();

  for(int i=0;i<missing_list.size();i++)
    delete missing_list[i];
  missing_list.clear();

  for(int i=0;i<geno_trio_list.size();i++)
    delete geno_trio_list[i];
  geno_trio_list.clear();

  for(int i=0;i<missing_trio_list.size();i++)
    delete missing_trio_list[i];
  missing_trio_list.clear();
}


PedData::~PedData(){
                    
  for(int i=0;i<fam_list.size();i++)
    delete fam_list[i];
  fam_list.clear();

  fam_map.clear();
  hap_list.clear();
  hap_freq_table.clear();

  weight_table.clear();
  hap_weight.clear();

  individual_list.clear();

  for(int i=0;i<snp_size;i++)
    delete[] content[i];
  content.clear();
}


vector<string> tokenize(const string& str, const string& delimiters){
               
  vector<string> tokens;
    	
  // skip delimiters at beginning.
  string::size_type lastPos = str.find_first_not_of(delimiters,0);
    	
  // find first "non-delimiter".
  string::size_type pos = str.find_first_of(delimiters,lastPos);

  while(string::npos != pos || string::npos != lastPos){
    // found a token, add it to the vector.
    tokens.push_back(str.substr(lastPos,pos-lastPos));
		
    // skip delimiters.  Note the "not_of"
    lastPos = str.find_first_not_of(delimiters,pos);
    
    // find next "non-delimiter"
    pos = str.find_first_of(delimiters,lastPos);
  }
  
  return tokens;
}


int find_loc(const vector<trio> & s_vec, const trio & m)
{
  for(int i=0;i<s_vec.size();i++)
  {
    if(s_vec[i].cat_o==m.cat_o)
    {
      if(s_vec[i].cat_par1==m.cat_par1 || (s_vec[i].typed_par1==0 && m.typed_par1==0))
      {
        if(s_vec[i].cat_par2==m.cat_par2 || (s_vec[i].typed_par2==0 && m.typed_par2==0))
          return i;
      }
    }
  }
  return -1;
}


bool inc_sort(const punit& a, const punit& b){
     
  return a.p < b.p;
}