Source code for seamm_util.md_statistics

# -*- coding: utf-8 -*-

"""Helpful routine using the underlying Python functions
in 'statistics' and 'statsmodels' to handle common needs
in molecular modeling
"""

import logging
import random
import statistics
import statsmodels.tsa.stattools as stattools

logger = logging.getLogger(__name__)



[docs]
def analyze_autocorrelation(y, interval=1, nlags=64, method="zr", use_confidence=False):
    """Find the statistical inefficiency, correlation time and other useful
    parameters given the time sequence of values 'y'.

    This function implements two approaches, 'zr' and 'cspsd' corresponding to
    the two papers (using the initials as the method name):

    zr: Zięba, A.; Ramza, P. Standard Deviation of the Mean of Autocorrelated
    Observations Estimated with the Use of the Autocorrelation Function
    Estimated From the Data. Metrology and Measurement Systems 2011, 18 (4),
    3–14.

    cspsd: Chodera, J. D.; Swope, W. C.; Pitera, J. W.; Seok, C.; Dill,
    K. A. Use of the Weighted Histogram Analysis Method for the Analysis of
    Simulated and Parallel Tempering Simulations. J. Chem. Theory Comput. 2006,
    3 (1), 26–41.

    Args:
        y ([float]): the time sequence to analyze
        interval (float): the time interval between values, defaults to 1
        nlags (int): the number of lags to start with
        method (str): the approach to use, 'zr' or 'cspsd'. Defaults to 'zr'

    Returns:
        dict(
            'n': (int) number of values in y
            'n_effective': (int) effective number of uncorrelated values
            'n_c': (int) the first zero crossing in the ACF
            'n_tau': (int) number of intervals in the correlation time
            'tau': (float) the correlation time if <interval> is correct
            'inefficiency': (float) the statistical inefficiency
            'acf': ([float]) the autocorrelation function
            'confidence_interval': [(float, float)] the 95% confidence interval

    The input parameter <nlags> is solely for performance. Since calculating
    the ACF is expensive (O(N^2)) and often the correlation time is short, at
    least if the trajectory is sampled reasonably, this function starts with a
    small number of lags, and doubles the number repeatedly if it does not find
    the zero crossing it expects.
    """

    # Find the autocorrelation time...
    n = len(y)

    logger.debug("analyze_autocorrelation for a vector of length {}".format(n))

    if nlags >= n:
        nlags = n - 1

    while True:
        logger.debug("   nlags = {}".format(nlags))

        acf, confidence = stattools.acf(
            y, nlags=nlags, alpha=0.05, fft=nlags > 16, unbiased=False
        )

        # remove the first items, which are 1 by definition
        acf = acf[1:]
        confidence = confidence[1:]

        # Find the last lag that is > 0
        sum_acf = 0.0
        n_c = 0
        if use_confidence:
            for lower, upper in confidence:
                delta = acf[n_c] - lower
                value = lower + delta * 3 / 4
                if nlags == n - 1:
                    print(value)
                if value < 0:
                    break
                sum_acf += acf[n_c]
                n_c += 1
        else:
            for value in acf:
                if nlags == n - 1:
                    print(value)
                if value < 0:
                    break
                sum_acf += value
                n_c += 1

        # If we found n_c, go on. Otherwise double the lags
        if value < 0:
            break
        else:
            if nlags == n - 1:
                raise RuntimeError(
                    "analyze_autocorrelation: Serious error! "
                    "Did not find negative autocorrelation value."
                )
            nlags *= 2
            if nlags >= n:
                nlags = n - 1

    logger.debug("   n_c = {}".format(n_c))

    if method == "zr":
        # Use the approach of Zięba and Ramza
        n_eff = (n - 2 * n_c - 1 + n_c * (n_c + 1) / n) / (1 + 2 * sum_acf)
        inefficiency = n / n_eff
        n_tau = (inefficiency - 1) / 2
    else:
        # Use the approach of Chodera, Swope, Pitera, Seok and Dill
        n_tau = 0.0
        for t in range(0, n_c):
            n_tau += (1 - (t + 1) / n) * acf[t]

        inefficiency = 1 + 2 * n_tau
        n_eff = n / inefficiency

    tau = n_tau * interval

    result = {
        "n": n,
        "n_effective": n_eff,
        "n_c": n_c,
        "n_tau": n_tau,
        "tau": tau,
        "inefficiency": inefficiency,
        "acf": acf,
        "confidence_interval": confidence,
    }

    return result




[docs]
def ar1(n=1000, a=10.0, b=0.2, sigma=0.5, seed=None):
    """Generate an AR(1) series of length n"""
    r = random.Random()
    r.seed(seed)

    y0 = a / (1 - b)
    y = [0.0] * n
    y[0] = a + b * y0 + sigma * (r.random() - 0.5)
    for i in range(1, n):
        y[i] = a + b * y[i - 1] + sigma * (r.random() - 0.5)

    return y



if __name__ == "__main__":
    print("in end section")
    import time

    print()

    a = 10.0
    b = 0.9
    for n in (15, 60, 240, 512, 1024, 2048, 10000, 100000):
        y = ar1(n, a=a, b=b, seed=52)
        t0 = time.time()
        r1 = analyze_autocorrelation(y, method="zr", nlags=4)
        t1 = time.time()
        r2 = analyze_autocorrelation(y, method="zr", nlags=4, use_confidence=True)
        t2 = time.time()

        print("{:>20s} = {:7.3f} {:7.3f}".format("time", t1 - t0, t2 - t1))
        print("{:>20s} = {} {}".format("nlags", len(r1["acf"]), len(r2["acf"])))
        for key in r1:
            if key not in ["acf", "confidence_interval"]:
                print("{:>20s} = {:7.1f} {:7.1f}".format(key, r1[key], r2[key]))

        ave = statistics.mean(y)
        stdev = statistics.stdev(y)
        print("                mean = {:9.3f}".format(a / (1 - b)))
        print("                 ave = {:9.3f}".format(ave))
        print("               stdev = {:9.3f}".format(stdev))
        print()