PolySNP.cc

00001 #include <cmath>
00002 #include <cfloat>
00003 #include <cassert> 
00004 #include <utility>
00005 #include <cstdlib>
00006 #include <cctype>
00007 #include <set>
00008 #include <algorithm>
00009 #include <boost/bind.hpp>
00010 #include <Sequence/PolyTable.hpp>
00011 #include <Sequence/Comparisons.hpp>
00012 #include <Sequence/Recombination.hpp>
00013 #include <Sequence/PolySNP.hpp>
00014 #include <Sequence/stateCounter.hpp>
00015 #include <Sequence/SeqConstants.hpp>
00016 #include <Sequence/PolySNPimpl.hpp>
00017 
00023 using std::string;
00024 using namespace Sequence::Recombination;
00025 
00026 namespace Sequence
00027 {
00028   struct uniqueSeq : public std::binary_function<std::string,std::string,bool>
00029   {
00030     inline bool operator()(const std::string & l, const std::string  & r) const
00031     {
00032       //use Sequence::Different to prevent missing sites 
00033       //causing 2 sequences to be labelled as distinct
00034       return ( l<r && Different(l,r) );
00035     }
00036   };
00037   _PolySNPImpl::_PolySNPImpl (const Sequence::PolyTable * data, bool haveOutgroup ,
00038                               unsigned outgroup, bool totMuts):
00039     _data(data),
00040     _nsites(data->numsites()),
00041     _nsam(data->size()),
00042     _outgroup(outgroup),
00043     _haveOutgroup(haveOutgroup),
00044     _totMuts(totMuts),
00045     _totsam (data->size()),
00046     _DVK(0),
00047     _DVH(1.0),
00048     _counted_singletons(false),
00049     _know_pi(false),
00050     _CalculatedDandV(false),
00051     _pi(0.),
00052     _singletons(0),
00053     _walls_Bprime(0),
00054     _NumPoly(0),
00055     _walls_B(0.),_walls_Q(0.),
00056     _calculated_wall_stats(false),
00057     _counts(std::vector< Sequence::stateCounter >(_nsites,Sequence::stateCounter('-'))),
00058     _derivedCounts(std::vector< std::pair< bool,  stateCounter > >
00059                    (_nsites,
00060                     std::make_pair<bool,stateCounter>(true,stateCounter('-')))),
00061     _preprocessed(false)
00062   {
00063     if (haveOutgroup)
00064       --_totsam;        //because one sequence in data is an outgroup!
00065     preprocess();
00066   }
00067 
00068   void  _PolySNPImpl::preprocess()
00084   {
00085     if (!_preprocessed)
00086       {
00087         for(unsigned site = 0 ; site < _nsites ; ++site)
00088           {
00089             for (unsigned seq = 0 ; seq < _nsam ; ++seq)
00090               {
00091                 //process counts w/o respect to
00092                 //ancestral or derived
00093                 if (!_haveOutgroup
00094                     || (_haveOutgroup && seq != _outgroup))
00095                   {
00096                     _counts[site]( (*_data)[seq][site] );
00097                   }
00098                 //process derived states if outgroup is
00099                 //present
00100                 if (_haveOutgroup == true)
00101                   {
00102                     //if outgroup state is missing data
00103                     //or a gap
00104                     //set the bool for that site to false
00105                     if ( std::toupper((*_data)[_outgroup][site]) == 'N' ||
00106                          (*_data)[_outgroup][site] == '-')
00107                       {
00108                         _derivedCounts[site].first = false;
00109                       }
00110                     else
00111                       {
00112                         //tabulate the derived state
00113                         _derivedCounts[site].first = true;
00114                         if ( seq != _outgroup && (*_data)[seq][site] != (*_data)[_outgroup][site] )
00115                           {
00116                             _derivedCounts[site].second((*_data)[seq][site]);
00117                           }
00118                       }
00119                   }
00120                 else
00121                   {
00122                     //if no outgroup, set bool
00123                     //for site to false
00124                     _derivedCounts[site].first = false;
00125                   }
00126               }
00127             if(_counts[site].nStates() > 1 && _counts[site].gap==0) ++_NumPoly;
00128           }
00129         _preprocessed = true;
00130       }
00131   }
00132 
00133   PolySNP::PolySNP (const Sequence::PolyTable * data, bool haveOutgroup ,
00134                     unsigned outgroup, bool totMuts):
00135     rep(std::auto_ptr<_PolySNPImpl>(new _PolySNPImpl(data,haveOutgroup,outgroup,totMuts)))
00145   {}
00146 
00147   PolySNP::~PolySNP (void)
00148   {
00149   }
00150 
00151   double
00152   PolySNP::ThetaPi (void)
00179   {
00180     assert ( rep->_preprocessed );
00181     if (rep->_know_pi == false)
00182       {
00183         double Pi = 0.0;
00184         for (unsigned i = 0;  i < rep->_nsites; ++i)
00185           {                     //iterate over sites
00186             if ( rep->_counts[i].gap == 0 &&
00187                  rep->_counts[i].nStates() > 1 )
00188               {
00189                 unsigned samplesize = rep->_totsam;
00190                 double SSH = 0.0;       //sum of site homozygosity
00191                 samplesize -= rep->_counts[i].n; //adjust sample size for missing data
00192                 if (samplesize > 1)
00193                   {
00194                     double denom = (double(samplesize)* (double(samplesize) - 1.0));
00195                     SSH += (rep->_counts[i].a > 0) ? double(rep->_counts[i].a) * 
00196                       double (rep->_counts[i].a-1) /denom : 0. ;
00197                     SSH += (rep->_counts[i].g > 0) ? double(rep->_counts[i].g) * 
00198                       double (rep->_counts[i].g-1) /denom : 0. ;
00199                     SSH += (rep->_counts[i].c > 0) ? double(rep->_counts[i].c) * 
00200                       double (rep->_counts[i].c-1) /denom : 0. ;
00201                     SSH += (rep->_counts[i].t > 0) ? double(rep->_counts[i].t) * 
00202                       double (rep->_counts[i].t-1) /denom : 0. ;
00203                     SSH += (rep->_counts[i].zero > 0) ? double(rep->_counts[i].zero) * 
00204                       double (rep->_counts[i].zero-1) /denom : 0. ;
00205                     SSH += (rep->_counts[i].one > 0) ? double(rep->_counts[i].one) * 
00206                       double (rep->_counts[i].one-1) /denom : 0. ;
00207                     Pi += (1.0 - SSH);
00208                   }
00209               }
00210           }
00211         rep->_pi = Pi;
00212         rep->_know_pi = true;
00213         return rep->_pi;
00214       }
00215     else
00216       return rep->_pi;
00217     return rep->_pi;
00218   }
00219 
00220 
00221   double
00222   PolySNP::ThetaW (void)
00240   {
00241     assert ( rep->_preprocessed );
00242     double W = 0.0;
00243     for (unsigned i = 0;  i < rep->_nsites; ++i)
00244       {                 //iterate over sitesvv
00245         if ( rep->_counts[i].gap == 0)
00246           {
00247             int nstates = rep->_counts[i].nStates();
00248             unsigned nsam_site = rep->_totsam - rep->_counts[i].n;
00249             double denom=0.0;
00250             if(rep->_totMuts==true&&nstates>=2)
00251               {
00252                 for(unsigned i = 1 ; i < nsam_site ; ++i)
00253                   denom += 1.0/double(i);
00254                 W += double(nstates-1)/denom;
00255               }
00256             else if (rep->_totMuts==false &&nstates>=2)
00257               {
00258                 for(unsigned i = 1 ; i < nsam_site ; ++i)
00259                   denom += 1.0/double(i);
00260                 W += 1.0/denom;
00261               }
00262           }
00263       }
00264     return (W);
00265   }
00266 
00267   double
00268   PolySNP::ThetaH (void)
00300   {
00301     assert ( rep->_preprocessed );
00302     if (rep->_NumPoly==0)
00303       return 0.;
00304     if( !rep->_haveOutgroup)
00305       return strtod("NAN",NULL);
00306     double H = 0.0;
00307     bool anc_is_present = 0;   //is ancestral state present in the ingroup?
00308 
00309     for (unsigned i = 0;  i < rep->_nsites; ++i)
00310       {                 //iterate over sites
00311         if ( rep->_derivedCounts[i].second.gap == 0)
00312           {
00313             unsigned samplesize = rep->_totsam; //sample size per site
00314             unsigned sumDerCounts = 0;
00315             sumDerCounts += rep->_derivedCounts[i].second.a;
00316             sumDerCounts += rep->_derivedCounts[i].second.g;
00317             sumDerCounts += rep->_derivedCounts[i].second.c;
00318             sumDerCounts += rep->_derivedCounts[i].second.t;
00319             sumDerCounts += rep->_derivedCounts[i].second.zero;
00320             sumDerCounts += rep->_derivedCounts[i].second.one;
00321             unsigned ancestralCounts = samplesize-sumDerCounts-rep->_derivedCounts[i].second.n;
00322             
00323             //if the ancestral state is not missing data, and
00324             //the sum of the derived counts + and missing data in the ingroup
00325             //does not equal the sample size, then the ancestral state is present
00326             //at least once in the ingroup, and anc_is_present = true
00327             anc_is_present = (rep->_derivedCounts[i].first == true &&
00328                               ancestralCounts > 0 ) ? true : false; 
00329             if (anc_is_present) //ancestral state must be present
00330               {
00331                 //number of derived states seen at this site
00332                 int numDer = rep->_derivedCounts[i].second.nStates();   
00333                 if (numDer == 1)
00334                   {     //simple if there is only one derived state inferred
00335                     samplesize -= rep->_derivedCounts[i].second.n;
00336                     double denom = (double(samplesize) *
00337                                     (double(samplesize) - 1.0));
00338                     H += (rep->_derivedCounts[i].second.a > 0 ) ? 
00339                       (2.0 * pow (double (rep->_derivedCounts[i].second.a), 2.0)) / denom : 0. ;
00340                     H += (rep->_derivedCounts[i].second.g > 0 ) ? 
00341                       (2.0 * pow (double (rep->_derivedCounts[i].second.g), 2.0)) / denom : 0. ;
00342                     H += (rep->_derivedCounts[i].second.c > 0 ) ? 
00343                       (2.0 * pow (double (rep->_derivedCounts[i].second.c), 2.0)) / denom : 0. ;
00344                     H += (rep->_derivedCounts[i].second.t > 0 ) ? 
00345                       (2.0 * pow (double (rep->_derivedCounts[i].second.t), 2.0)) / denom : 0. ;
00346                     H += (rep->_derivedCounts[i].second.zero > 0) ? 
00347                       (2.0 * pow (double (rep->_derivedCounts[i].second.zero), 2.0)) / denom : 0. ;
00348                     H += (rep->_derivedCounts[i].second.one > 0 ) ? 
00349                       (2.0 * pow (double (rep->_derivedCounts[i].second.one), 2.0)) / denom : 0. ;
00350                   }
00351                 else if (numDer == 2 && rep->_haveOutgroup)     //MUST have outgroup--else can't proceed
00352                   {     //use a "missing data" scheme if there is >1 derived state
00353                     //iterate over derived states
00354                     int config[2];
00355                     unsigned k = 0;
00356                     if(rep->_derivedCounts[i].second.a > 0)
00357                       {
00358                         config[k++] = rep->_derivedCounts[i].second.a;
00359                       }
00360                     if(rep->_derivedCounts[i].second.g > 0)
00361                       {
00362                         config[k++] = rep->_derivedCounts[i].second.g;
00363                       }
00364                     if(rep->_derivedCounts[i].second.c > 0)
00365                       {
00366                         config[k++] = rep->_derivedCounts[i].second.c;
00367                       }
00368                     if(rep->_derivedCounts[i].second.t > 0)
00369                       {
00370                         config[k++] = rep->_derivedCounts[i].second.t;
00371                       }
00372                     if(rep->_derivedCounts[i].second.zero > 0)
00373                       {
00374                         config[k++] = rep->_derivedCounts[i].second.zero;
00375                       }
00376                     if(rep->_derivedCounts[i].second.one > 0)
00377                       {
00378                         config[k++] = rep->_derivedCounts[i].second.one;
00379                       }
00380                     for (int i = 0; i < 2; ++i)
00381                       {
00382                         double sample_size_adjust = (i==0) ? double(config[1]) : double(config[0]);
00383                         H += (2.0 *
00384                               pow (double (config[i]),
00385                                    2.0)) / ((double(samplesize) -
00386                                              sample_size_adjust)
00387                                             * (double(samplesize) -
00388                                                sample_size_adjust
00389                                                - 1.0));
00390                       }
00391                   }
00392               }
00393           }
00394       }
00395     return H;
00396   }
00397 
00398   double
00399   PolySNP::ThetaL (void)
00432   {
00433     assert(rep->_haveOutgroup==true);
00434     assert ( rep->_preprocessed );
00435     double thetal = 0.0;
00436     if(rep->_NumPoly==0) return thetal;
00437     bool anc_is_present = 0;   //is ancestral state present in the ingroup?
00438                   
00439     for (unsigned i = 0;  i < rep->_nsites; ++i)
00440       {                 //iterate over sites
00441         if (rep->_derivedCounts[i].first == true && rep->_derivedCounts[i].second.gap == 0)
00442           {
00443             unsigned samplesize = rep->_totsam; //sample size per site
00444             unsigned sumDerCounts = 0;
00445             sumDerCounts += rep->_derivedCounts[i].second.a;
00446             sumDerCounts += rep->_derivedCounts[i].second.g;
00447             sumDerCounts += rep->_derivedCounts[i].second.c;
00448             sumDerCounts += rep->_derivedCounts[i].second.t;
00449             sumDerCounts += rep->_derivedCounts[i].second.zero;
00450             sumDerCounts += rep->_derivedCounts[i].second.one;
00451             unsigned ancestralCounts = samplesize-sumDerCounts-rep->_derivedCounts[i].second.n;
00452                           
00453             //if the ancestral state is not missing data, and
00454             //the sum of the derived counts + and missing data in the ingroup
00455             //does not equal the sample size, then the ancestral state is present
00456             //at least once in the ingroup, and anc_is_present = true
00457             anc_is_present = (rep->_derivedCounts[i].first == true &&
00458                               ancestralCounts > 0 ) ? true : false; 
00459             if (anc_is_present) //ancestral state must be present
00460               {
00461                 //number of derived states seen at this site
00462                 int numDer = rep->_derivedCounts[i].second.nStates();   
00463                 if (numDer == 1)
00464                   {     //simple if there is only one derived state inferred
00465                     samplesize -= rep->_derivedCounts[i].second.n;
00466                     double denom = (double(samplesize) - 1.0);
00467                     thetal += (rep->_derivedCounts[i].second.a > 0 ) ? 
00468                       double (rep->_derivedCounts[i].second.a) / denom : 0. ;
00469                     thetal += (rep->_derivedCounts[i].second.g > 0 ) ? 
00470                       double (rep->_derivedCounts[i].second.g) / denom : 0. ;
00471                     thetal += (rep->_derivedCounts[i].second.c > 0 ) ? 
00472                       double (rep->_derivedCounts[i].second.c) / denom : 0. ;
00473                     thetal += (rep->_derivedCounts[i].second.t > 0 ) ? 
00474                       double (rep->_derivedCounts[i].second.t) / denom : 0. ;
00475                     thetal += (rep->_derivedCounts[i].second.zero > 0) ? 
00476                       double (rep->_derivedCounts[i].second.zero) / denom : 0. ;
00477                     thetal += (rep->_derivedCounts[i].second.one > 0 ) ? 
00478                       double (rep->_derivedCounts[i].second.one) / denom : 0. ;
00479                   }
00480                 else if (numDer == 2 && rep->_haveOutgroup)     //MUST have outgroup--else can't proceed
00481                   {     //use a "missing data" scheme if there is >1 derived state
00482                     //iterate over derived states
00483                     int config[2];
00484                     unsigned k = 0;
00485                     if(rep->_derivedCounts[i].second.a > 0)
00486                       {
00487                         config[k++] = rep->_derivedCounts[i].second.a;
00488                       }
00489                     if(rep->_derivedCounts[i].second.g > 0)
00490                       {
00491                         config[k++] = rep->_derivedCounts[i].second.g;
00492                       }
00493                     if(rep->_derivedCounts[i].second.c > 0)
00494                       {
00495                         config[k++] = rep->_derivedCounts[i].second.c;
00496                       }
00497                     if(rep->_derivedCounts[i].second.t > 0)
00498                       {
00499                         config[k++] = rep->_derivedCounts[i].second.t;
00500                       }
00501                     if(rep->_derivedCounts[i].second.zero > 0)
00502                       {
00503                         config[k++] = rep->_derivedCounts[i].second.zero;
00504                       }
00505                     if(rep->_derivedCounts[i].second.one > 0)
00506                       {
00507                         config[k++] = rep->_derivedCounts[i].second.one;
00508                       }
00509                     for (int i = 0; i < 2; ++i)
00510                       {
00511                         double sample_size_adjust = (i==0) ? double(config[1]) : double(config[0]);
00512                         thetal += (double (config[i])) / (double(samplesize) -
00513                                                           sample_size_adjust- 1.0);
00514                       }
00515                   }
00516               }
00517           }
00518       }
00519     return thetal;
00520   }
00521 
00522   unsigned
00523   PolySNP::NumPoly (void)
00527   {
00528     assert ( rep->_preprocessed );
00529     unsigned npoly = 0;
00530     for (unsigned i = 0;  i < rep->_nsites; ++i)
00531       {                 //iterate over sites
00532         if (rep->_counts[i].nStates() > 1 && 
00533             rep->_counts[i].gap == 0)
00534           ++npoly;
00535       }
00536     return npoly;
00537   }
00538 
00539   unsigned
00540   PolySNP::NumMutations (void)
00545   {
00546     assert( rep->_preprocessed );
00547     unsigned nmut = 0;
00548 
00549     for (unsigned i = 0;  i < rep->_nsites; ++i)
00550       {                 //iterate over sites
00551         int nstates = (rep->_counts[i].gap==0) ? rep->_counts[i].nStates() : 0;
00552         if (nstates > 1)
00553           nmut += nstates - 1;
00554       }
00555     return nmut;
00556   }
00557 
00558   unsigned
00559   PolySNP::NumSingletons (void)
00563   {
00564     assert ( rep->_preprocessed );
00565     unsigned nsing = 0,nstates;
00566     for (unsigned i = 0;  i < rep->_nsites; ++i)
00567       {                 //iterate over sites
00568         unsigned curr_nsing=0,nsam=0;
00569         nstates = rep->_counts[i].nStates();
00570         if (rep->_counts[i].gap == 0 && nstates>1)
00571           {
00572             nsam = rep->_totsam - rep->_counts[i].n;
00573             if(nsam==2 && nstates ==2) //if n = 2 and there are 2 states, there must be 1 singleton
00574               curr_nsing=1;
00575             else
00576               {
00577                 curr_nsing += (rep->_counts[i].a == 1) ? 1 : 0;
00578                 curr_nsing += (rep->_counts[i].g == 1) ? 1 : 0;
00579                 curr_nsing += (rep->_counts[i].c == 1) ? 1 : 0;
00580                 curr_nsing += (rep->_counts[i].t == 1) ? 1 : 0;
00581                 curr_nsing += (rep->_counts[i].zero == 1) ? 1 : 0;
00582                 curr_nsing += (rep->_counts[i].one == 1) ? 1 : 0;
00583               }
00584           }
00585         nsing += curr_nsing;
00586       }
00587     return nsing;
00588   }
00589 
00590 
00591   unsigned
00592   PolySNP::NumExternalMutations (void)
00597   {
00598     if(!rep->_haveOutgroup) return SEQMAXUNSIGNED;
00599     assert ( rep->_preprocessed );
00600     int next = 0;
00601     for (unsigned i = 0;  i < rep->_nsites ; ++i)
00602       {                 //iterate over sites
00603         unsigned nsam=rep->_totsam;
00604         unsigned curr_next=0;
00605         if(rep->_derivedCounts[i].first == true && 
00606            rep->_derivedCounts[i].second.gap == 0)
00607           {
00608             nsam -= rep->_derivedCounts[i].second.n;
00609             curr_next += (rep->_derivedCounts[i].second.a == 1) ? 1 : 0;
00610             curr_next += (rep->_derivedCounts[i].second.g == 1) ? 1 : 0;
00611             curr_next += (rep->_derivedCounts[i].second.c == 1) ? 1 : 0;
00612             curr_next += (rep->_derivedCounts[i].second.t == 1) ? 1 : 0;
00613             curr_next += (rep->_derivedCounts[i].second.zero == 1) ? 1 : 0;
00614             curr_next += (rep->_derivedCounts[i].second.one == 1) ? 1 : 0;
00615           }
00616         next += (nsam>1) ? curr_next : 0;
00617       }
00618     return next;
00619   }
00620 
00621 
00622   double
00623   PolySNP::TajimasD (void)
00631   {
00632     assert ( rep->_preprocessed );
00633     if(rep->_NumPoly==0) return strtod("NAN",NULL);
00634     double D = 0.0;
00635     double Pi = ThetaPi ();
00636     double W = ThetaW ();
00637     if (fabs(Pi-0.) <= DBL_EPSILON && fabs(W-0.) <= DBL_EPSILON)
00638       D = 0.0;
00639     else
00640       D = (Pi - W) / Dnominator ();
00641     return D;
00642   }
00643 
00644   double PolySNP::Hprime (bool likeThorntonAndolfatto)
00657   {
00658     assert ( rep->_preprocessed );
00659     if(rep->_NumPoly==0) return strtod("NAN",NULL);
00660     assert(rep->_haveOutgroup==true);
00661     double Hpr = 0.0;
00662     double a = a_sub_n ();
00663     double b = b_sub_n ();
00664     double pi = ThetaPi ();
00665     double theta = ThetaW();
00666                  
00667     double thetal = ThetaL();
00668     double b_n_plus1 = b_sub_n_plus1();
00669     double S = (rep->_totMuts) ? NumMutations() : NumPoly();
00670     double thetasq = (likeThorntonAndolfatto == false ) ? S * (S-1)/(a*a + b) : theta*theta;
00671                   
00672     double vThetal = 
00673       (rep->_totsam * theta)/(2.0 * (rep->_totsam - 1.0)) 
00674       + (2.0 * pow(rep->_totsam/(rep->_totsam - 1.0), 2.0) * (b_n_plus1 - 1.0) - 1.0) * thetasq;
00675                   
00676     double vPi = 
00677       (3.0 * rep->_totsam *(rep->_totsam + 1.0) * theta 
00678        + 2.0 * ( rep->_totsam * rep->_totsam + rep->_totsam + 3.0) * thetasq  )
00679       / (9 * rep->_totsam * (rep->_totsam -1.0));
00680                   
00681     double cov = 
00682       ((rep->_totsam + 1.0) / (3.0 * (rep->_totsam - 1.0))) * theta
00683       +(( 7.0 * rep->_totsam * rep->_totsam + 3.0 * rep->_totsam - 2.0 - 4.0 * rep->_totsam *( rep->_totsam + 1.0) * b_n_plus1)
00684         / (2.0 * pow ((rep->_totsam - 1.0), 2.0)))
00685       * thetasq  ;
00686                   
00687     //    Hpr = pi - omega;
00688     Hpr = pi - thetal;
00689     Hpr /= pow ( (vThetal + vPi - 2.0 * cov), 0.5);
00690     return (Hpr); 
00691   }
00692 
00693 
00694   double
00695   PolySNP::Dnominator (void)
00700   {
00701     assert ( rep->_preprocessed );
00702     if(rep->_NumPoly==0) return strtod("NAN",NULL);
00703     double S = 0.0;
00704     if (rep->_totMuts)
00705       {
00706         S = double (NumMutations ());
00707       }
00708     else if (!(rep->_totMuts))
00709       {
00710         S = double (NumPoly ());
00711       }
00712     double a1, a2, b1, b2, c1, c2, e1, e2;
00713 
00714     a1 = a_sub_n ();
00715     a2 = b_sub_n ();
00716     b1 = (rep->_totsam + 1.0) / (3.0 * (rep->_totsam - 1.0));
00717     b2 = (2.0 * (pow (rep->_totsam, 2.0) 
00718                  + rep->_totsam + 3.0)) / (9.0 * rep->_totsam *
00719                                            (rep->_totsam - 1.0));
00720     c1 = b1 - 1.0 / a1;
00721     c2 = b2 - (rep->_totsam + 2.0) / (a1 * rep->_totsam) + a2 / pow (a1, 2.0);
00722     e1 = c1 / a1;
00723     e2 = c2 / (pow (a1, 2.0) + a2);
00724     double denominator = pow ((e1 * S + e2 * S * (S - 1.0)), 0.5);
00725     return (denominator);
00726   }
00727 
00728   void
00729   PolySNP::DepaulisVeuilleStatistics (void)
00742   {
00743     assert ( rep->_preprocessed );
00744     if (!(rep->_CalculatedDandV))
00745       {
00746         if(rep->_NumPoly == 0)
00747           {
00748             rep->_DVK = 1;
00749             rep->_DVH = 0.;
00750             return;
00751           }
00752         if (rep->_data->size() > 0)
00753           {
00754             //step 1 : determine which sequences are unique in the data,
00755             //exluding missing data
00756             std::set<string,uniqueSeq> unique_haplotypes;
00757             if (rep->_haveOutgroup)
00758               {
00759                 unique_haplotypes.insert(rep->_data->begin(),
00760                                          rep->_data->begin()+rep->_outgroup);
00761                 unique_haplotypes.insert(rep->_data->begin()+rep->_outgroup+1,
00762                                          rep->_data->end());
00763               }
00764             else
00765               {
00766                 unique_haplotypes.insert(rep->_data->begin(),
00767                                          rep->_data->end());
00768               }
00769             //now do the real work
00770             std::set<string,uniqueSeq>::const_iterator beg = unique_haplotypes.begin(),
00771               end = unique_haplotypes.end();
00772             rep->_DVK = unique_haplotypes.size();
00773             while(beg != end)
00774               {
00775                 unsigned _count = 0;
00776                 if (rep->_haveOutgroup)
00777                   {
00778                     _count += std::count_if(rep->_data->begin(),
00779                                             rep->_data->begin()+rep->_outgroup,
00780                                             boost::bind(notDifferent<string>,
00781                                                         _1,*beg));
00782                     _count += std::count_if(rep->_data->begin()+rep->_outgroup+1,
00783                                             rep->_data->end(),
00784                                             boost::bind(notDifferent<string>,
00785                                                         _1,*beg));
00786                   }
00787                 else
00788                   {
00789                     _count += std::count_if(rep->_data->begin(),
00790                                             rep->_data->end(),
00791                                             boost::bind(notDifferent<string>,
00792                                                         _1,*beg));
00793                   }
00794                 rep->_DVH -= pow (double (_count) / rep->_totsam, 2.0);
00795                 ++beg;
00796               }
00797             rep->_DVH *= rep->_totsam / (rep->_totsam - 1.0);
00798             rep->_CalculatedDandV = 1;
00799           }
00800       }
00801   }
00802 
00803   double PolySNP::WallsB(void)
00809   {
00810     assert ( rep->_preprocessed );
00811     if (rep->_calculated_wall_stats == false)
00812       {
00813         WallStats();
00814       }
00815     return rep->_walls_B;
00816   }
00817 
00818   void PolySNP::WallStats(void)
00819   {
00820     assert ( rep->_preprocessed );
00821     unsigned S = 0;
00822     //explicity count # of bi-allelic sites,
00823     //since that's the proper denominator
00824     for( std::vector<stateCounter>::const_iterator itr = rep->_counts.begin() ;
00825          itr < rep->_counts.end();
00826          ++itr)
00827       {
00828         if ( itr->nStates() == 2 && itr->gap == 0)
00829           ++S;
00830       }
00831     if (S > 1)
00832       {
00833         unsigned nhap_curr, nhap_left;
00834 
00835         nhap_left = SEQMAXUNSIGNED;
00836 
00837         unsigned A = 0;//number of partitions with D' = 1 (see Wall 1999)
00838         //iterate over sites (actually, adjacent pairs of sites)
00839         for (unsigned site1 = 0 ; site1 < rep->_nsites-1 ; ++site1)
00840           {
00841             for(unsigned site2=site1+1 ; site2 < rep->_nsites ; ++site2)
00842               {
00843                 if ( rep->_counts[site1].nStates() == 2
00844                      && rep->_counts[site2].nStates() == 2 )
00845                   {
00846                     std::string config;
00847                     config.resize(2);
00848                     std::set<string,uniqueSeq> unique_haplotypes;
00849                     nhap_curr = 0;
00850                     for (unsigned i = 0 ; i < rep->_nsam ; ++i)
00851                       {
00852                         if ( (!rep->_haveOutgroup) || (rep->_haveOutgroup && i != rep->_outgroup) )
00853                           {
00854                             config[0] = (*rep->_data)[i][site1];
00855                             config[1] = (*rep->_data)[i][site2];
00856                             unique_haplotypes.insert(config);
00857                           }
00858                       }
00859                     nhap_curr = unique_haplotypes.size();
00860                     if(site1==0)
00861                       {
00862                         if (nhap_curr == 2)
00863                           {
00864                             ++rep->_walls_Bprime;
00865                             ++A;
00866                           }
00867                       }
00868                     else
00869                       {
00870                         if (nhap_curr == 2)
00871                           ++rep->_walls_Bprime;
00872                         if (nhap_curr == 2 && nhap_left != 2)
00873                           ++A;
00874                       }
00875                     nhap_left = nhap_curr;
00876                     site1=site2;
00877                   }
00878               }
00879           }
00880         rep->_walls_B = double(rep->_walls_Bprime)/(double(S-1));
00881         rep->_walls_Q = (double(rep->_walls_Bprime) + double(A))/(double(S));
00882       }
00883     else 
00884       {
00885         rep->_walls_B = strtod("NAN",NULL);
00886         rep->_walls_Bprime = 0;
00887         rep->_walls_Q = strtod("NAN",NULL);
00888       }
00889     rep->_calculated_wall_stats=true;
00890   }
00891 
00892 
00893   unsigned PolySNP::WallsBprime(void)
00899   {
00900     assert ( rep->_preprocessed );
00901     if (rep->_calculated_wall_stats == false)
00902       {
00903         WallStats();
00904       }
00905     return rep->_walls_Bprime;
00906   }
00907 
00908   double PolySNP::WallsQ(void)
00914   {
00915     assert ( rep->_preprocessed );
00916     if (rep->_calculated_wall_stats == false)
00917       {
00918         WallStats();
00919       }
00920     return rep->_walls_Q;
00921   }
00922 
00923   double
00924   PolySNP::VarPi (void)
00929   {
00930     double Pi = ThetaPi ();
00931     double variance = 3.0 * rep->_totsam * (rep->_totsam + 1.0) * Pi +
00932       2.0 * (pow (rep->_totsam, 2.0) + rep->_totsam + 3.0) * pow (Pi, 2.0);
00933     variance /= (11.0 * pow (rep->_totsam, 2.0) - 7.0 * rep->_totsam + 6.0);
00934     return (variance);
00935   }
00936 
00937   double
00938   PolySNP::StochasticVarPi (void)
00943   {
00944     double Pi = ThetaPi ();
00945     double variance = (3.0 * pow (rep->_totsam, 2.0) - 3.0 * rep->_totsam + 2.0) * Pi +
00946       2.0 * rep->_totsam * (rep->_totsam - 1.0) * pow (Pi, 2.0);
00947     variance /= (11.0 * pow (rep->_totsam, 2.0) - 7.0 * rep->_totsam + 6.0);
00948     return (variance);
00949   }
00950 
00951   double
00952   PolySNP::SamplingVarPi (void)
00958   {
00959     double Pi = ThetaPi ();
00960     double variance =
00961       2.0 * (3.0 * rep->_totsam - 1.0) * Pi + 2.0 * (2.0 * rep->_totsam +
00962                                                      3.0) * pow (Pi, 2.0);
00963     variance /= (11.0 * pow (rep->_totsam, 2.0) - 7.0 * rep->_totsam + 6.0);
00964     return (variance);
00965   }
00966 
00967 
00968   double
00969   PolySNP::VarThetaW (void)
00974   {
00975     double a1 = a_sub_n ();
00976     double a2 = b_sub_n ();
00977     double S = (rep->_totMuts) ? NumMutations() : NumPoly();
00978     double variance = pow (a1, 2.0) * S + a2 * pow (S, 2.0);
00979     variance /= pow (a1, 2.0) * (pow (a1, 2.0) + a2);
00980     return (variance);
00981   }
00982 
00983   //correct
00984   double
00985   PolySNP::FuLiD (void)
00991   {
00992     assert ( rep->_preprocessed );
00993     //    assert(rep->_haveOutgroup == true);
00994     if(rep->_NumPoly==0 || !rep->_haveOutgroup) return strtod("NAN",NULL);
00995     double D = 0.0;
00996     double ExternalMutations =
00997       double (NumExternalMutations ());
00998     double NumMut = double (NumMutations ());
00999     double a = a_sub_n ();
01000     double b = b_sub_n ();
01001     double c = c_sub_n ();
01002     double vD = 1.0 +
01003       (pow (a, 2.0) / (b + pow (a, 2.0)) *
01004        (c - (rep->_totsam + 1.0) / (rep->_totsam - 1.0)));
01005     double uD = a - 1.0 - vD;
01006     D = NumMut - a * double (ExternalMutations);
01007     D /= pow ((uD * NumMut + vD * pow (NumMut, 2.0)), 0.5);
01008     return (D);
01009   }
01010 
01011 
01012   //correct
01013   double
01014   PolySNP::FuLiF (void)
01020   {
01021     assert ( rep->_preprocessed );
01022     if(rep->_NumPoly==0 || !rep->_haveOutgroup) return strtod("NAN",NULL);
01023     double F = 0.0;
01024     double Pi = ThetaPi ();
01025     double NumMut = double (NumMutations());
01026     double ExternalMutations =
01027       double (NumExternalMutations ());
01028     double a = a_sub_n ();
01029     double a_n_plus1 = a_sub_n_plus1 ();
01030     double b = b_sub_n ();
01031     double c = c_sub_n ();
01032     double vF = c + 2.0 * (pow (rep->_totsam, 2.0) + rep->_totsam +
01033                            3.0) / (9.0 * rep->_totsam * (double (rep->_totsam - 1.0)));
01034     vF -= (2.0 / (rep->_totsam - 1.0));
01035     vF /= (pow (a, 2.0) + b);
01036 
01037     double uF = 1.0 + (rep->_totsam + 1.0) / (3.0 * (double (rep->_totsam - 1.0)));
01038     uF -= 4.0 * ((rep->_totsam + 1.0) / (pow (rep->_totsam - 1.0, 2.0))) *
01039       (a_n_plus1 - 2.0 * rep->_totsam / (rep->_totsam + 1.0));
01040     uF /= a;
01041     uF -= vF;
01042 
01043     F = Pi - ExternalMutations;
01044     F /= pow (uF * NumMut + vF * pow (NumMut, 2.0), 0.5);
01045     return (F);
01046   }
01047 
01048   //correct
01049   double
01050   PolySNP::FuLiDStar (void)
01055   {
01056     assert ( rep->_preprocessed );
01057     if(rep->_NumPoly==0) return strtod("NAN",NULL);
01058     double DStar = 0.0;
01059     double Singletons =
01060       double (NumSingletons ());
01061     double NumMut = double (NumMutations ());
01062 
01063     double a = a_sub_n ();
01064     double b = b_sub_n ();
01065     double d = d_sub_n ();
01066 
01067     double vD = pow (rep->_totsam / (rep->_totsam - 1.0), 2.0) * b;
01068     vD += pow (a, 2.0) * d;
01069     vD -= 2.0 * (rep->_totsam * a * (a + 1.0)) /
01070       (pow (double (rep->_totsam - 1.0), 2.0));
01071     vD /= (pow (a, 2.0) + b);
01072 
01073     double uD =
01074       (rep->_totsam / (rep->_totsam - 1.0)) * (a -
01075                                                (rep->_totsam / (rep->_totsam - 1.0))) - vD;
01076 
01077     DStar = (rep->_totsam / (rep->_totsam - 1.0)) * NumMut - a * double (Singletons);
01078     DStar /= pow (uD * NumMut + vD * pow (NumMut, 2.0), 0.5);
01079     return (DStar);
01080   }
01081 
01082   //correct
01083   double
01084   PolySNP::FuLiFStar (void)
01091   {
01092     assert ( rep->_preprocessed );
01093     if(rep->_NumPoly==0) return strtod("NAN",NULL);
01094     double FStar = 0.0;
01095     double Singletons =
01096       double (NumSingletons ());
01097     double Pi = ThetaPi ();
01098     double NumMut = double (NumMutations ());
01099 
01100     double a = a_sub_n ();
01101     double a_n_plus1 = a_sub_n_plus1 ();
01102     double b = b_sub_n ();
01103     //vF is taken from the correction published by
01104     //Simonsen et al.  (1995) Genetics 141: 413, eqn A5
01105     double vF = 2.0 * pow (rep->_totsam, 3.0) + 110.0 * pow (rep->_totsam,
01106                                                              2.0) -
01107       255.0 * rep->_totsam + 153.0;
01108     vF /= (9.0 * pow (rep->_totsam, 2.0) * (rep->_totsam - 1.0));
01109     vF += (((2.0 * (rep->_totsam - 1.0) * a) / pow (rep->_totsam, 2.0)) -
01110            (8.0 * b / rep->_totsam));
01111     vF /= (pow (a, 2.0) + b);
01112 
01113     double uF =
01114       (4.0 * pow (rep->_totsam, 2.0) + 19.0 * rep->_totsam + 3.0 -
01115        12.0 * (rep->_totsam + 1.0) * a_n_plus1);
01116     uF /= (3.0 * rep->_totsam * (rep->_totsam - 1.0));
01117     uF /= a;
01118     uF -= vF;
01119     FStar = Pi - (((rep->_totsam - 1.0) / rep->_totsam)) * double (Singletons);
01120     FStar /= pow ((uF * NumMut + vF * pow (NumMut, 2.0)), 0.5);
01121     return (FStar);
01122   }
01123 
01124   double
01125   PolySNP::a_sub_n (void)
01131   {
01132     assert ( rep->_preprocessed );
01133     int i;
01134     double a = 0.0;
01135     for (i = 1; i < int (rep->_totsam); ++i)
01136       a += 1. / double (i);
01137     return a;
01138   }
01139 
01140   double
01141   PolySNP::a_sub_n_plus1 (void)
01146   {                             //used by Fu and Li tests
01147     assert ( rep->_preprocessed );
01148     int i;
01149     double a = 0.0;
01150     for (i = 1; i < int (rep->_totsam) + 1; ++i)
01151       {
01152         a += 1. / double (i);
01153       }
01154     return (a);
01155   }
01156 
01157   double
01158   PolySNP::b_sub_n (void)
01163   {                             // sum of 1/i^2
01164     assert ( rep->_preprocessed );
01165     int i;
01166     double b = 0.0;
01167     for (i = 1; i < int (rep->_totsam); ++i)
01168       b += 1. / (pow (double (i), 2.0));
01169     return b;
01170   }
01171 
01172   double
01173   PolySNP::b_sub_n_plus1(void)
01179   {                             // sum of 1/i^2
01180     assert ( rep->_preprocessed );
01181     int i;
01182     double b = 0.0;
01183     for (i = 1; i < int (rep->_totsam) + 1; ++i)
01184       b += 1. / (pow (double (i), 2.0));
01185     return b;
01186   }
01187 
01188   double
01189   PolySNP::c_sub_n (void)
01199   {                             //from Fu and Li 93
01200     assert ( rep->_preprocessed );
01201     double c = 0.0, a = a_sub_n ();
01202     if (fabs(rep->_totsam-2.) <= DBL_EPSILON)
01203       {
01204         c = 1.0;
01205       }
01206     else
01207       {
01208         c = 2.0 * (rep->_totsam * a - 2.0 * (rep->_totsam - 1.0));
01209         c /= ((rep->_totsam - 1.0) * (rep->_totsam - 2.0));
01210       }
01211     return c;
01212   }
01213 
01214   double
01215   PolySNP::d_sub_n (void)
01220   {                             //from Fu and Li 93
01221     assert ( rep->_preprocessed );
01222     double a_n_plus1, c, d;
01223     a_n_plus1 = a_sub_n_plus1 ();
01224     c = c_sub_n ();
01225     d = c + (rep->_totsam - 2.0) / (pow (rep->_totsam - 1.0, 2.0));
01226     d += (2.0 / (rep->_totsam - 1.0)) *
01227       (1.5 -  ((2.0 * a_n_plus1 - 3.0) / (rep->_totsam - 2.0)) -  1.0 / rep->_totsam);
01228     return d;
01229   }
01230 
01231   double
01232   PolySNP::DandVH (void)
01241   {
01242     if (!(rep->_CalculatedDandV))
01243       DepaulisVeuilleStatistics ();
01244 
01245     return rep->_DVH;
01246   }
01247 
01248   unsigned
01249   PolySNP::DandVK (void)
01258   {
01259     if (!(rep->_CalculatedDandV))
01260       DepaulisVeuilleStatistics ();
01261 
01262     return rep->_DVK;
01263   }
01264 
01265   double
01266   PolySNP::HudsonsC (void)
01274   {
01275     assert ( rep->_preprocessed );
01276     if(rep->_NumPoly==0) return strtod("NAN",NULL);
01277     return(Recombination::HudsonsC (rep->_data, rep->_haveOutgroup, rep->_outgroup));
01278   }
01279 
01280 
01281   unsigned
01282   PolySNP::Minrec (void)
01289   {
01290     assert ( rep->_preprocessed );
01291     if(rep->_NumPoly<2) return SEQMAXUNSIGNED;
01292     unsigned a,b,e,numgametes,Rmin=0,x=0;
01293     bool flag=false;
01294     
01295     char c11,c12,c21,c22;
01296     unsigned states1=0,states2=0;
01297     
01298     for (a=x+1 ; a < rep->_nsites ; ++a)
01299       {
01300         c11 = c12 = 'Z'; //Z is a dummy value
01301         //count # states in site a
01302         states1 = rep->_counts[a].nStates();
01303 
01304         c11 = (c11 == 'Z' && rep->_counts[a].a > 0 ) ? 'A' : 'Z';
01305         c11 = (c11 == 'Z' && rep->_counts[a].g > 0 ) ? 'G' : c11;
01306