University of Connecticut University of UC Title Fallback Connecticut

The GTRModel class

By way of a data member of the Likelihood class, the GTRModel class will supply BeagleLib with the eigenvectors, eigenvalues, and relative rates it needs in order to compute the likelihood of a tree. The Eigen library is used to compute the eigenvectors and eigenvalues for a given combination of exchangeabilities and nucleotide frequencies.

Begin by creating a new header file named gtr_model.hpp to contain the GTRModel class declaration and member function bodies.

#pragma once

#include <algorithm>
#include <vector>
#include "libhmsbeagle/beagle.h"
#include <boost/math/distributions/gamma.hpp>
#include <Eigen/Dense>

namespace strom
    {

    class GTRModel
        {
        template <class T> friend class Likelihood;

        public:
            typedef Eigen::Matrix<double, 4, 4, Eigen::RowMajor>    EigenMatrix4d;
            typedef Eigen::Vector4d                                 EigenVector4d;
        
        
                                        GTRModel();
                                        ~GTRModel();
            
            std::string                 describeModel() const;

            void                        setGammaShape(double shape);
            void                        setGammaNCateg(unsigned ncateg);
            void                        setExchangeabilities(const std::vector<double> & exchangeabilities);
            void                        setStateFreqs(const std::vector<double> & state_frequencies);
            void                        setExchangeabilitiesAndStateFreqs(const std::vector<double> & exchangeabilities, const std::vector<double> & state_frequencies);

            int                         setBeagleEigenDecomposition(int beagle_instance);
            int                         setBeagleStateFrequencies(int beagle_instance);
            int                         setBeagleAmongSiteRateVariationRates(int beagle_instance);
            int                         setBeagleAmongSiteRateVariationProbs(int beagle_instance);

                                        EIGEN_MAKE_ALIGNED_OPERATOR_NEW

            typedef boost::shared_ptr< GTRModel > SharedPtr;

        private:

            void                        clear();
            void                        recalcRateMatrix();
            void                        recalcGammaRates();
        
            // substitution model specification
            std::vector<double>         _state_freqs;
            std::vector<double>         _exchangeabilities;
            
            // for computing eigenvectors/eigenvalues
            EigenMatrix4d               _sqrtPi;
            EigenMatrix4d               _sqrtPiInv;
            EigenMatrix4d               _qmatrix;
            EigenMatrix4d               _eigenvectors;
            EigenMatrix4d               _inverse_eigenvectors;
            EigenVector4d               _eigenvalues; 

            // among-site rate heterogeneity specification
            unsigned                    _num_categ;
            double                      _gamma_shape;
            std::vector<double>         _relative_rates;
            std::vector<double>         _categ_boundaries;
            std::vector<double>         _rate_probs; 

        };
    
    // member function bodies go here
    }

Constructor, destructor, and clear functions

The constructor and destructor are very simple and similar to all the other constructors and destructors you have created thus far. The clear function is slightly more complicated, however. It sets up the GTRModel class so that it behaves exactly like the JC69 model we have been using. The number of among-site rate categories is set to 1, the gamma shape is set to 0.5 (which is arbitrary because one rate category means that rate homogeneity is being assumed), the 6 exchangeabilities are all set to 1, the 4 nucleotide frequencies are all set to 0.25, and the relative rate distribution comprises a single representative relative rate equal to 1.0 that has probability 1.0. The clear function finishes by calling recalcRateMatrix, which computes eigenvalues and eigenvectors.

inline GTRModel::GTRModel()
    {
    clear();
    }

inline GTRModel::~GTRModel()
    {
    }

inline void GTRModel::clear()
    {
    _num_categ = 1;
    _gamma_shape = 0.5;

    // Set up GTR rate matrix representing the JC69 model by default
    setExchangeabilitiesAndStateFreqs({1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, {0.25, 0.25, 0.25, 0.25});

    _relative_rates.assign(1, 1.0);
    _categ_boundaries.assign(1, 0.0);
    _rate_probs.assign(1, 1.0);
    
    recalcRateMatrix();
    }

The describeModel member function

This function may be used to reveal the current state of the model. It constructs and returns a string, which can be output to either cout or to a file.

inline std::string GTRModel::describeModel() const
    {
    std::string s;
    s += "\n----------------- GTR Model Info ------------------\n";
    s += boost::str(boost::format("State frequencies:   \n  piA = %g\n  piC = %g\n  piG = %g\n  piT = %g") % _state_freqs[0] % _state_freqs[1] % _state_freqs[2] % _state_freqs[3]);
    s += boost::str(boost::format("\nRelative rates:    \n  rAC = %g\n  rAG = %g\n  rAT = %g\n  rCG = %g\n  rCT = %g\n  rGT = %g") % _exchangeabilities[0] % _exchangeabilities[1] % _exchangeabilities[2] % _exchangeabilities[3] % _exchangeabilities[4] % _exchangeabilities[5]);
    s += boost::str(boost::format("\nRate categories:   \n  %d") % _num_categ);
    s += boost::str(boost::format("\nGamma shape:       \n  %g") % _gamma_shape);
    if (_num_categ > 1)
        {
        s += "\nCategory boundaries and relative rate means:";
        s += boost::str(boost::format("\n%12s %12s %12s %12s") % "category" %  "lower" % "upper" % "mean rate");
        for (unsigned i = 1; i < _num_categ; ++i)
            s += boost::str(boost::format("\n%12d %12.5f %12.5f %12.5f") % i % _categ_boundaries[i-1] % _categ_boundaries[i] % _relative_rates[i-1]);
        s += boost::str(boost::format("\n%12d %12.5f %12s %12.5f") % _num_categ % _categ_boundaries[_num_categ-1] % "infinity" % _relative_rates[_num_categ-1]);
        }
    s += "\n---------------------------------------------------\n";
    return s;
    }

The setGammaNCateg, setGammaShape, setExchangeabilities and setStateFreqs member functions

These 4 functions set the value of the data members _num_categ, _gamma_shape, _exchangeabilities, and _state_freqs, respectively. The reason we need "setter" functions for these is that changing any one of these requires calling either recalcGammaRates() (_num_categ and _gamma_shape) or recalcRateMatrix() (_exchangeabilities and _state_freqs), so making these variable private and creating setter functions for them ensures that this essential additional task is always performed. In addition to setting the value of _state_freqs, the setStateFreqs member function also computes values for data members _sqrtPi and _sqrtPiInv, which are used in recalcRateMatrix() to facilitate eigenvector/eigenvalue decomposition.

inline void GTRModel::setGammaNCateg(unsigned ncateg)
    {
    if (ncateg < 1)
        throw XStrom(boost::str(boost::format("number of categories used for among-site rate variation must be greater than zero but the value %d was supplied") % ncateg));
    _num_categ = ncateg;
    recalcGammaRates();
    }

inline void GTRModel::setGammaShape(double shape)
    {
    if (shape <= 0.0)
        throw XStrom(boost::str(boost::format("gamma shape must be greater than zero but the value %.5f was supplied") % shape));
    _gamma_shape = shape;
    recalcGammaRates();
    }

inline void GTRModel::setExchangeabilities(const std::vector<double> & exchangeabilities)
    {
    _exchangeabilities.assign(exchangeabilities.begin(), exchangeabilities.end());
    recalcRateMatrix();
    }

inline void GTRModel::setStateFreqs(const std::vector<double> & state_frequencies)
    {
    _state_freqs.assign(state_frequencies.begin(), state_frequencies.end());
    
    Eigen::Map<const Eigen::Array4d> tmp(&_state_freqs[0]);
    _sqrtPi = tmp.sqrt().matrix().asDiagonal();
    _sqrtPiInv = _sqrtPi.inverse();
    
    recalcRateMatrix();
    }

inline void GTRModel::setExchangeabilitiesAndStateFreqs(const std::vector<double> & exchangeabilities, const std::vector<double> & state_frequencies)
    {
    _exchangeabilities.assign(exchangeabilities.begin(), exchangeabilities.end());
    
    _state_freqs.assign(state_frequencies.begin(), state_frequencies.end());
    
    Eigen::Map<const Eigen::Array4d> tmp(&_state_freqs[0]);
    _sqrtPi = tmp.sqrt().matrix().asDiagonal();
    _sqrtPiInv = _sqrtPi.inverse();
    
    recalcRateMatrix();
    }

The recalcRateMatrix member function

This function requires more explanation than any other, and I’ve broken down the discussion into sections.

GTR rate matrix

The instantaneous rate matrix Q for the GTR model has 6 exchangeability parameters (a, b, c, d, e and f) and 4 nucleotide frequencies (πA, πC, πG and πT). It is stored in the data member _qmatrix.

Q matrix for GTR model

The rows represent the "from" state (in the order A, C, G, T, from top to bottom), while the columns represent the "to" state (also in the order A, C, G, T, from left to right). The diagonal elements of this matrix are negative and equal to the sum of the other elements in the same row because any change from state A, for example, to a different state must reduce the amount of A, hence the rate of change is negative. The row sums are zero because this model assumes that the sequence length neither shrinks nor grows, and thus any increase in C, G, or T must occur at the expense of A.

Matrices related to nucleotide frequencies

Two important diagonal matrices are defined by the vector of nucleotide frequencies stored in _state_freqs. These are shown below and are stored in the data members _sqrtPi (top) and _sqrtPiInv (bottom).

sqrtPi and sqrtPiInv matrices

Diagonalization of the rate matrix

The rate matrix Q can be represented as the matrix product of the (right) eigenvector matrix V, the diagonal matrix of eigenvalues L, and the inverse eigenvector matrix.

diagonalization of the rate matrix Q

Computing the transition probability matrix

The transition probability matrix P is obtained by exponentiating the product of Q and time t Qt. The calculation of P involves exponentiating only the middle matrix; the eigenvectors are unchanged by this rescaling.

diagonalization of the transition probability matrix P

Scaling the rate matrix

The Q matrix must be scaled so that time t is measured in expected number of substitutions. If this scaling is not performed, the edge lengths lose their interpretation as expected number of substitutions per site. The scaling factor needed may be obtained by effectively performing the following matrix multiplication and afterwards summing the off-diagonal elements: dividing each element of Q by this sum yields the desired normalization.

Q matrix scaling

Symmetrizing the rate matrix

Computing eigenvalues and eigenvectors of a symmetric matrix is simpler and more efficient than computing them for an asymmetric matrix. Hence, it is common to obtain eigenvalues and eigenvectors of a symmetrical matrix S derived from the Q matrix rather than for the Q matrix itself. The eigenvalues for S and Q are identical, and the eigenvector matrix W obtained from S may be easily converted into the desired matrix V containing the eigenvectors of Q. Note that V may be obtained by premultiplying W by the diagonal matrix of inverse square roots of nucleotide frequencies.

diagonalization of the symmetrized rate matrix S

The recalcRateMatrix function

With all that in mind, hopefully the recalcRateMatrix will be easier to understand.

inline void GTRModel::recalcRateMatrix()
    {
    double piA = _state_freqs[0];
    double piC = _state_freqs[1];
    double piG = _state_freqs[2];
    double piT = _state_freqs[3];
    
    double rAC = _exchangeabilities[0];
    double rAG = _exchangeabilities[1];
    double rAT = _exchangeabilities[2];
    double rCG = _exchangeabilities[3];
    double rCT = _exchangeabilities[4];
    double rGT = _exchangeabilities[5];

    double inverse_scaling_factor = piA*(rAC*piC + rAG*piG + rAT*piT) + piC*(rAC*piA + rCG*piG + rCT*piT) + piG*(rAG*piA + rCG*piC + rGT*piT) + piT*(rAT*piA + rCT*piC + rGT*piG);
    double scaling_factor = 1.0/inverse_scaling_factor;

    _qmatrix(0,0) = -scaling_factor*(rAC*piC + rAG*piG + rAT*piT);
    _qmatrix(0,1) = scaling_factor*rAC*piC;
    _qmatrix(0,2) = scaling_factor*rAG*piG;
    _qmatrix(0,3) = scaling_factor*rAT*piT;
    
    _qmatrix(1,0) = scaling_factor*rAC*piA;
    _qmatrix(1,1) = -scaling_factor*(rAC*piA + rCG*piG + rCT*piT);
    _qmatrix(1,2) = scaling_factor*rCG*piG;
    _qmatrix(1,3) = scaling_factor*rCT*piT;
    
    _qmatrix(2,0) = scaling_factor*rAG*piA;
    _qmatrix(2,1) = scaling_factor*rCG*piC;
    _qmatrix(2,2) = -scaling_factor*(rAG*piA + rCG*piC + rGT*piT);
    _qmatrix(2,3) = scaling_factor*rGT*piT;
    
    _qmatrix(3,0) = scaling_factor*rAT*piA;
    _qmatrix(3,1) = scaling_factor*rCT*piC;
    _qmatrix(3,2) = scaling_factor*rGT*piG;
    _qmatrix(3,3) = -scaling_factor*(rAT*piA + rCT*piC + rGT*piG);

    // S is a symmetric matrix
    EigenMatrix4d S = EigenMatrix4d(_sqrtPi*_qmatrix*_sqrtPiInv);

    // Can use efficient eigensystem solver because S is symmetric
    Eigen::SelfAdjointEigenSolver<Eigen::Matrix4d> solver(S);
    if (solver.info() != Eigen::Success)
        {
        throw XStrom("Error in the calculation of eigenvectors and eigenvalues of the GTR rate matrix");
        }

    _eigenvectors           = _sqrtPiInv*solver.eigenvectors();
    _inverse_eigenvectors   = solver.eigenvectors().transpose()*_sqrtPi;
    _eigenvalues            = solver.eigenvalues();
    }

The recalcGammaRates member function

This function recomputes several vectors related to modeling discrete-gamma among-site rate variation. The relative rate of a nucleotide site is the ratio of its rate of substitution to the average rate. A relative rate equal to 1.0 means that the site is evolving at exactly the average rate, while a relative rate of 2.0 means that site is evolving at twice the average rate. Relative rates are assumed to follow a Gamma distribution with mean 1.0 and shape _gamma_shape. The Gamma distribution is partitioned into _num_categ subsets having equal area under the probability density curve (and thus equal probability), and the mean of each category serves as the relative rate representing that category. Rate heterogeneity is thus modeled as a discrete mixture distribution with each component of the mixture having equal probability (i.e. representing the same proportion of sites).

The mean of one category is computed as follows:
rate category mean calculation

This involves a ratio of integrals of two Gamma distributions. The Gamma distribution in the numerator has shape _gamma_shape + 1, while the Gamma distribution in the denominator has shape _gamma_shape. The values u and v define the boundaries of the category for which the mean is being calculated.

The code below makes use of quantile and cdf functions for the Gamma distribution provided by the Boost Math library.

inline void GTRModel::recalcGammaRates()
    {
    assert(_num_categ > 0);
    _relative_rates.assign(_num_categ, 1.0);
    _categ_boundaries.assign(_num_categ, 0.0);
    _rate_probs.assign(_num_categ, 1.0/_num_categ);

    if (_num_categ == 1)
        return;

    assert(_gamma_shape > 0.0);
    double alpha = _gamma_shape;
    double beta = 1.0/_gamma_shape;
    
    boost::math::gamma_distribution<> my_gamma(_gamma_shape, 1.0/_gamma_shape);
    boost::math::gamma_distribution<> my_gamma_plus(_gamma_shape + 1.0, 1.0/_gamma_shape);

    double cum_upper        = 0.0;
    double cum_upper_plus   = 0.0;
    double upper            = 0.0;
    double cum_prob         = 0.0;
    for (unsigned i = 1; i <= _num_categ; ++i)
        {
        double lower                = upper;
        double cum_lower_plus       = cum_upper_plus;
        double cum_lower            = cum_upper;
        cum_prob                    += _rate_probs[i-1];

        if (i < _num_categ)
            {
            upper                   = boost::math::quantile(my_gamma, cum_prob);
            cum_upper_plus          = boost::math::cdf(my_gamma_plus, upper);
            cum_upper               = boost::math::cdf(my_gamma, upper);
            }
        else
            {
            cum_upper_plus          = 1.0;
            cum_upper               = 1.0;
            }

        double numer                = cum_upper_plus - cum_lower_plus;
        double denom                = cum_upper - cum_lower;
        double r_mean               = (denom > 0.0 ? (alpha*beta*numer/denom) : 0.0);
        _relative_rates[i-1]        = r_mean;
        _categ_boundaries[i-1]      = lower;
        }
    }

The setBeagleEigenDecomposition member function

This function provides the eigenvectors, inverse eigenvectors and eigenvalues of the rate matrix to BeagleLib. The second argument is the index of the rate matrix being provided, which is 0 in our case because we are using the same rate matrix for all sites.

inline int GTRModel::setBeagleEigenDecomposition(int beagle_instance)
    {
    int code = beagleSetEigenDecomposition(
        beagle_instance,
        0,
        &_eigenvectors.data()[0],
        &_inverse_eigenvectors.data()[0],
        &_eigenvalues.data()[0]); 
        
    return code;
    }

The setBeagleStateFrequencies member function

This function provides the nucleotide (state) frequencies to BeagleLib. The second argument is the index of the frequency vector being provided, which is 0 in our case because we are using the same nucleotide frequency vector for all sites.

inline int GTRModel::setBeagleStateFrequencies(int beagle_instance)
    {
    int code = beagleSetStateFrequencies(
         beagle_instance, 
         0, 
         &_state_freqs[0]);

    return code;
    }

The setBeagleAmongSiteRateVariationRates member function

This function sends the vector of category rates to BeagleLib. The category rates are the relative rates representing each category in the discrete gamma mixture model.

inline int GTRModel::setBeagleAmongSiteRateVariationRates(int beagle_instance)
    {
    int code = beagleSetCategoryRates(
        beagle_instance,
        &_relative_rates[0]);

    return code;
    }

The setBeagleAmongSiteRateVariationProbs member function

This function sends the vector of category weights to BeagleLib. The category weights are the probabilities of each category in the discrete gamma mixture model. The probability of each category the same for the discrete gamma model; however, BeagleLib does not assume that the discrete gamma model is being used. It assumes only that relative rates are modeled using a discrete mixture model. It is up to you to provide the correct category weights (probabilities) and rates.

inline int GTRModel::setBeagleAmongSiteRateVariationProbs(int beagle_instance)
    {
    int code = beagleSetCategoryWeights(
        beagle_instance,
        0, 
        &_rate_probs[0]); 

    return code;
    }