Source code for historydag.utils

"""Utility functions and classes for working with HistoryDag objects."""

import ete3
from math import log, exp, isfinite
from collections import Counter
from functools import wraps
import operator
from collections import UserDict
from decimal import Decimal
from warnings import warn
from itertools import chain, combinations
from typing import (
    List,
    Any,
    TypeVar,
    Callable,
    Union,
    Iterable,
    Generator,
    Tuple,
    NamedTuple,
    Optional,
)
from typing import TYPE_CHECKING


try:
    from math import comb
except ImportError:

    def comb(n, k):
        """
        A fast way to calculate binomial coefficients
        from https://stackoverflow.com/a/3025547
        https://en.wikipedia.org/wiki/Binomial_coefficient#Multiplicative_formula
        """
        if 0 <= k <= n:
            ntok = 1
            ktok = 1
            for t in range(1, min(k, n - k) + 1):
                ntok *= n
                ktok *= t
                n -= 1
            return ntok // ktok
        else:
            return 0
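
# Example (editor's check, not part of the library): the pure-Python fallback
# above agrees with math.comb for in-range and out-of-range arguments.
#
#   >>> comb(5, 2)
#   10
#   >>> comb(2, 5)
#   0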


if TYPE_CHECKING:
    from historydag.dag import HistoryDagNode, HistoryDag

Weight = Any
Label = Union[NamedTuple, "UALabel"]
F = TypeVar("F", bound=Callable[..., Any])


class TaxaError(ValueError):
    pass


class UALabel(str):
    _fields: Tuple = tuple()

    def __new__(cls):
        return super(UALabel, cls).__new__(cls, "UA_Node")

    def __eq__(self, other):
        return isinstance(other, UALabel)

    def __hash__(self):
        return hash("UA_Node")

    def __iter__(self):
        raise RuntimeError("Attempted to iterate from dag root UALabel")

    def _asdict(self):
        raise RuntimeError("Attempted to call _asdict on dag root UALabel")


# ######## Decorators ########

def access_nodefield_default(fieldname: str, default: Any) -> Any:
    """A decorator for accessing label fields on a HistoryDagNode.

    Converts a function taking some label field's values as positional
    arguments, to a function taking HistoryDagNodes as positional arguments.

    Args:
        fieldname: The name of the label field whose value the function takes
            as arguments
        default: A value that should be returned if one of the arguments is
            the DAG UA node.

    For example, instead of
    `lambda n1, n2: default if n1.is_ua_node() or n2.is_ua_node() else func(n1.label.fieldname, n2.label.fieldname)`,
    this wrapper allows one to write
    `access_nodefield_default(fieldname, default)(func)`.
    """

    def decorator(func):
        @ignore_uanode(default)
        @access_field("label")
        @access_field(fieldname)
        @wraps(func)
        def wrapper(*args: Label, **kwargs: Any) -> Weight:
            return func(*args, **kwargs)

        return wrapper

    return decorator


def access_field(fieldname: str) -> Callable[[F], F]:
    """A decorator for conveniently accessing a field in a label.

    To be used instead of something like
    `lambda l1, l2: func(l1.fieldname, l2.fieldname)`. Instead just write
    `access_field(fieldname)(func)`. Supports arbitrarily many positional
    arguments, which are all expected to be labels (namedtuples) with field
    `fieldname`.
    """

    def decorator(func: F):
        @wraps(func)
        def wrapper(*args: Label, **kwargs: Any) -> Any:
            newargs = [getattr(label, fieldname) for label in args]
            return func(*newargs, **kwargs)

        return wrapper

    return decorator


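# Example (editor's sketch, not part of the library): access_field lifts a
# function on field values to a function on labels. The namedtuple "Lbl" here
# is hypothetical.
#
#   >>> from collections import namedtuple
#   >>> Lbl = namedtuple("Lbl", ["sequence"])
#   >>> dist = access_field("sequence")(
#   ...     lambda s1, s2: sum(a != b for a, b in zip(s1, s2))
#   ... )
#   >>> dist(Lbl("AAA"), Lbl("ATA"))
#   1

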
def ignore_uanode(default: Any) -> Callable[[F], F]:
    """A decorator to return a default value if any argument is a UA node.

    For instance, to allow the distance between two nodes to be zero if one
    is the UA node.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args: "HistoryDagNode", **kwargs: Any):
            for node in args:
                if node.is_ua_node():
                    return default
            return func(*args, **kwargs)

        return wrapper

    return decorator


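# Example (editor's sketch): access_nodefield_default composes the decorators
# above; "FakeNode" is a hypothetical stand-in for HistoryDagNode.
#
#   >>> from collections import namedtuple
#   >>> Lbl = namedtuple("Lbl", ["x"])
#   >>> class FakeNode:
#   ...     def __init__(self, label, ua=False):
#   ...         self.label, self._ua = label, ua
#   ...     def is_ua_node(self):
#   ...         return self._ua
#   >>> f = access_nodefield_default("x", 0)(lambda a, b: a + b)
#   >>> f(FakeNode(Lbl(1)), FakeNode(Lbl(2)))
#   3
#   >>> f(FakeNode(None, ua=True), FakeNode(Lbl(2)))
#   0

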
def explode_label(labelfield: str):
    """A decorator to make it easier to expand a Label by a certain field.

    Args:
        labelfield: the name of the field whose contents the wrapped function
            is expected to explode

    Returns:
        A decorator which converts a function which explodes a field value,
        into a function which explodes the whole label at that field.
    """

    def decorator(
        func: Callable[[Any], Iterable[Any]]
    ) -> Callable[[Label], Iterable[Label]]:
        @wraps(func)
        def wrapfunc(label, *args, **kwargs):
            Label = type(label)
            d = label._asdict()
            for newval in func(d[labelfield], *args, **kwargs):
                d[labelfield] = newval
                yield Label(**d)

        return wrapfunc

    return decorator


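# Example (editor's sketch): explode_label turns a field-expander into a
# label-expander. The namedtuple "Lbl" and ambiguity code "N" are hypothetical.
#
#   >>> from collections import namedtuple
#   >>> Lbl = namedtuple("Lbl", ["sequence"])
#   >>> @explode_label("sequence")
#   ... def expand(seq):
#   ...     yield from (seq.replace("N", base) for base in "AG")
#   >>> list(expand(Lbl("AN")))
#   [Lbl(sequence='AA'), Lbl(sequence='AG')]

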
# ######## Distances and comparisons... ########

def cartesian_product(
    optionlist: List[Callable[[], Iterable]], accum=tuple()
) -> Generator[Tuple, None, None]:
    """The cartesian product of iterables in a list.

    Takes a list of functions which each return a fresh generator on options
    at that site, and returns a generator yielding tuples, which are elements
    of the cartesian product of the passed generators' contents.
    """
    if optionlist:
        for term in optionlist[0]():
            yield from cartesian_product(optionlist[1:], accum=(accum + (term,)))
    else:
        yield accum


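# Example (editor's check): each list entry is a zero-argument callable
# returning a fresh iterable of options at that site.
#
#   >>> list(cartesian_product([lambda: "ab", lambda: "xy"]))
#   [('a', 'x'), ('a', 'y'), ('b', 'x'), ('b', 'y')]

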
def hist(c: Counter, samples: int = 1):
    """Pretty-print a counter, normalizing counts by the number of samples
    (passed as the argument `samples`)."""
    ls = list(c.items())
    ls.sort()
    print("Weight\t| Frequency\n------------------")
    for weight, freq in ls:
        print(f"{weight} \t| {freq if samples == 1 else freq / samples}")


def is_collapsed(tree: ete3.TreeNode) -> bool:
    """Return whether the provided tree is collapsed.

    Collapsed means that any edge whose target is not a leaf node connects
    nodes with different sequences.
    """
    return not any(
        node.sequence == node.up.sequence and not node.is_leaf()
        for node in tree.iter_descendants()
    )


def collapse_adjacent_sequences(tree: ete3.TreeNode) -> ete3.TreeNode:
    """Collapse nonleaf nodes that have the same sequence."""
    # Need to keep doing this until the tree is fully collapsed. See gctree
    # for this!
    tree = tree.copy()
    to_delete = []
    for node in tree.get_descendants():
        # This must stay invariably hamming distance, since it's measuring
        # equality of strings.
        if not node.is_leaf() and node.up.sequence == node.sequence:
            to_delete.append(node)
    for node in to_delete:
        node.delete()
    return tree


class AddFuncDict(UserDict):
    """Container for function keyword arguments to
    :meth:`historydag.HistoryDag.weight_count`.

    This is primarily useful because it allows instances to be added. Passing
    the result to `weight_count` as keyword arguments counts the weights
    jointly.

    An :class:`historydag.utils.AddFuncDict` may be passed as keyword
    arguments to the :meth:`historydag.HistoryDag.weight_count`,
    :meth:`historydag.HistoryDag.trim_optimal_weight`, or
    :meth:`historydag.HistoryDag.optimal_weight_annotate` methods to trim or
    annotate a :class:`historydag.HistoryDag` according to the weight that the
    contained functions implement.

    For example,
    `dag.weight_count(**(parsimony_utils.hamming_distance_countfuncs + make_newickcountfuncs()))`
    would return a Counter object in which the weights are tuples containing
    hamming parsimony and newick strings.

    Args:
        initialdata: A dictionary containing functions keyed by "start_func",
            "edge_weight_func", and "accum_func". "start_func" specifies the
            weight assigned to leaf HistoryDagNodes. "edge_weight_func"
            specifies the weight assigned to an edge between two
            HistoryDagNodes, with the first argument the parent node, and the
            second argument the child node. "accum_func" specifies how to
            'add' a list of weights. See
            :meth:`historydag.HistoryDag.weight_count` for more details.
        name: A string containing a name for the weight to be counted. If a
            tuple of weights will be returned, use ``names`` instead.
        names: A tuple of strings containing names for the weights to be
            counted, if a tuple of weights will be returned by the passed
            functions. If only a single weight will be returned, use ``name``
            instead.
    """

    requiredkeys = {"start_func", "edge_weight_func", "accum_func"}

    def __init__(self, initialdata, name: str = None, names: Tuple[str] = None):
        self.name: Optional[str]
        self.names: Tuple[str]
        if name is not None and names is not None:
            raise ValueError(
                "Pass a value to either keyword argument 'name' or 'names'."
            )
        elif name is None and names is None:
            self.name = "unknown weight"
            self.names = (self.name,)
        elif name is not None:
            self.name = name
            self.names = (self.name,)
        elif names is not None:
            if not isinstance(names, tuple):
                raise ValueError("``names`` keyword argument expects a tuple.")
            self.names = names
            self.name = None
        if not set(initialdata.keys()) == self.requiredkeys:
            raise ValueError(
                "Must provide functions named " + ", ".join(self.requiredkeys)
            )
        super().__init__(initialdata)

    def __add__(self, other) -> "AddFuncDict":
        fdict1 = self._convert_to_tupleargs()
        fdict2 = other._convert_to_tupleargs()
        n = len(fdict1.names)

        def newaccumfunc(weightlist):
            return fdict1["accum_func"](
                [weight[0:n] for weight in weightlist]
            ) + fdict2["accum_func"]([weight[n:] for weight in weightlist])

        def addfuncs(func1, func2):
            def newfunc(*args):
                return func1(*args) + func2(*args)

            return newfunc

        return AddFuncDict(
            {
                "start_func": addfuncs(fdict1["start_func"], fdict2["start_func"]),
                "edge_weight_func": addfuncs(
                    fdict1["edge_weight_func"], fdict2["edge_weight_func"]
                ),
                "accum_func": newaccumfunc,
            },
            names=fdict1.names + fdict2.names,
        )

    def __str__(self) -> str:
        return f"AddFuncDict[{', '.join(str(it) for it in self.names)}]"

    def _convert_to_tupleargs(self):
        if self.name is not None:

            def node_to_weight_decorator(func):
                @wraps(func)
                def wrapper(*args):
                    return (func(*args),)

                return wrapper

            def list_of_weights_to_weight_decorator(func):
                @wraps(func)
                def wrapper(weighttuplelist: List[Weight]):
                    return (func([wt[0] for wt in weighttuplelist]),)

                return wrapper

            return AddFuncDict(
                {
                    "start_func": node_to_weight_decorator(self["start_func"]),
                    "edge_weight_func": node_to_weight_decorator(
                        self["edge_weight_func"]
                    ),
                    "accum_func": list_of_weights_to_weight_decorator(
                        self["accum_func"]
                    ),
                },
                names=(self.name,),
            )
        else:
            return self

    def linear_combination(self, coeffs, significant_digits=8):
        """Convert an AddFuncDict implementing a tuple of weights to a linear
        combination of those weights.

        This only works when the weights computed by the AddFuncDict use
        plain `sum` as their accum_func. Otherwise, although the resulting
        AddFuncDict may be usable without errors, its behavior is undefined.

        Args:
            coeffs: The coefficients to be multiplied with each weight before
                summing.
            significant_digits: To combat floating point errors, only this
                many digits after the decimal will be significant in
                comparisons between weights.

        Returns:
            A new AddFuncDict object which computes the specified linear
            combination of weights.
        """
        n = len(self.names)
        if len(coeffs) != n:
            raise ValueError(
                f"Expected {n} ranking coefficients but received {len(coeffs)}."
            )
        if n == 1:
            raise ValueError(
                "linear_combination should only be called on AddFuncDict"
                " objects which compute more than one weight, e.g."
                " resulting from summing one or more AddFuncDicts."
            )

        def make_floatstate(val):
            return FloatState(round(val, significant_digits), state=val)

        def _lc(weight_tuple):
            return make_floatstate(sum(c * w for c, w in zip(coeffs, weight_tuple)))

        def accum_func(weights):
            return make_floatstate(sum(w.state for w in weights))

        start_func = self["start_func"]
        edge_func = self["edge_weight_func"]

        def new_start_func(n):
            return _lc(start_func(n))

        def new_edge_func(n1, n2):
            return _lc(edge_func(n1, n2))

        return AddFuncDict(
            {
                "start_func": new_start_func,
                "edge_weight_func": new_edge_func,
                "accum_func": accum_func,
            },
            name="("
            + " + ".join(
                str(c) + "(" + name + ")" for c, name in zip(coeffs, self.names)
            )
            + ")",
        )


class HistoryDagFilter:
    """Container for a :class:`historydag.utils.AddFuncDict` and an optimality
    function `optimal_func`.

    Args:
        weight_funcs: An :class:`AddFuncDict` object
        optimal_func: A function that specifies how to choose the optimal
            result from `weight_funcs`, e.g. `min` or `max`
    """

    def __init__(
        self,
        weight_funcs: AddFuncDict,
        optimal_func,
        ordering_name=None,
        eq_func=operator.eq,
    ):
        self.weight_funcs = weight_funcs
        self.optimal_func = optimal_func
        self.eq_func = eq_func
        end_idx = len(self.weight_funcs.names)
        if ordering_name is None:
            if optimal_func == min:
                self.ordering_names = (("minimum", end_idx),)
            elif optimal_func == max:
                self.ordering_names = (("maximum", end_idx),)
            else:
                self.ordering_names = (("optimal", end_idx),)
        else:
            self.ordering_names = ((ordering_name, end_idx),)

    def __str__(self) -> str:
        start_idx = 0
        descriptions = []
        for ordering_name, end_idx in self.ordering_names:
            these_names = self.weight_funcs.names[start_idx:end_idx]
            if len(these_names) > 1:
                descriptions.append(
                    f"{ordering_name} ({', '.join(str(it) for it in these_names)})"
                )
            else:
                descriptions.append(ordering_name + " " + these_names[0])
            start_idx = end_idx
        return "HistoryDagFilter[" + " then ".join(descriptions) + "]"

    def __getitem__(self, item):
        if item == "optimal_func":
            return self.optimal_func
        elif item == "eq_func":
            return self.eq_func
        else:
            return self.weight_funcs[item]

    # Or should it be &?
    def __add__(self, other):
        if not isinstance(other, HistoryDagFilter):
            raise TypeError(
                f"Can only add HistoryDagFilter to HistoryDagFilter, not {type(other)}"
            )
        split_idx = len(self.weight_funcs.names)

        def new_optimal_func(weight_tuple_seq):
            weight_tuple_seq = tuple(weight_tuple_seq)
            first_optimal_val = self.optimal_func(
                t[:split_idx] for t in weight_tuple_seq
            )
            second_optimal_val = other.optimal_func(
                t[split_idx:]
                for t in weight_tuple_seq
                if self.eq_func(t[:split_idx], first_optimal_val)
            )
            return first_optimal_val + second_optimal_val

        if self.eq_func == operator.eq and other.eq_func == operator.eq:
            new_eq_func = operator.eq
        else:

            def new_eq_func(a, b):
                return self.eq_func(a[:split_idx], b[:split_idx]) and other.eq_func(
                    a[split_idx:], b[split_idx:]
                )

        ret = HistoryDagFilter(
            self.weight_funcs + other.weight_funcs,
            new_optimal_func,
            eq_func=new_eq_func,
        )
        ret.ordering_names = self.ordering_names + tuple(
            (name, idx + split_idx) for name, idx in other.ordering_names
        )
        return ret

    def keys(self):
        yield from self.weight_funcs.keys()
        yield from ("optimal_func", "eq_func")

    # def with_linear_combination_ordering(self, ranking_coeffs, eq_func=operator.eq):
    #     ranking_coeffs = tuple(ranking_coeffs)
    #     n = len(self.weight_funcs.names)
    #     if len(ranking_coeffs) != n:
    #         raise ValueError(
    #             f"Expected {n} ranking coefficients but received {len(ranking_coeffs)}."
    #         )
    #
    #     def _lc(weight_tuple):
    #         return sum(c * w for c, w in zip(ranking_coeffs, weight_tuple))
    #
    #     def new_optimal_func(weight_tuple_sequence):
    #         return min(weight_tuple_sequence, key=_lc)
    #
    #     def new_eq_func(weight_tup1, weight_tup2):
    #         return eq_func(_lc(weight_tup1), _lc(weight_tup2))
    #
    #     ret = HistoryDagFilter(self.weight_funcs, new_optimal_func, eq_func=new_eq_func)
    #     new_optimal_func_name = (
    #         "minimum ("
    #         + "+".join(str(c) + chr(97 + i) for i, c in enumerate(ranking_coeffs))
    #         + ") for ("
    #         + ",".join(chr(97 + i) for i in range(n))
    #         + ") ="
    #     )
    #     ret.ordering_names = ((new_optimal_func_name, n),)
    #     return ret


node_countfuncs = AddFuncDict(
    {
        "start_func": lambda n: 0,
        "edge_weight_func": lambda n1, n2: 1,
        "accum_func": sum,
    },
    name="NodeCount",
)
"""Provides functions to count the number of nodes in trees.

For use with :meth:`historydag.HistoryDag.weight_count`.
"""


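# Usage sketches (editor's examples) for the containers above; "dag" is a
# hypothetical HistoryDag:
#
#   >>> combined = node_countfuncs + node_countfuncs
#   >>> combined.names
#   ('NodeCount', 'NodeCount')
#   >>> combined.linear_combination([1, 2]).name
#   '(1(NodeCount) + 2(NodeCount))'
#   >>> str(HistoryDagFilter(node_countfuncs, min))
#   'HistoryDagFilter[minimum NodeCount]'
#   >>> dag.weight_count(**node_countfuncs)  # doctest: +SKIP

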
def natural_edge_probability(parent, child):
    """Return the downward-conditional edge probability of the edge from
    parent to child.

    This is defined as 1/n, where n is the number of edges descending from
    the same child clade of ``parent`` as this edge.
    """
    if parent.is_ua_node():
        return 1 / len(list(parent.children()))
    else:
        eset = parent.clades[child.clade_union()]
        return 1 / len(eset.targets)


log_natural_probability_funcs = AddFuncDict(
    {
        "start_func": lambda n: 0,
        "edge_weight_func": lambda n1, n2: log(natural_edge_probability(n1, n2)),
        "accum_func": sum,
    },
    name="LogNaturalProbability",
)
"""Provides functions to count the probabilities of histories in a DAG,
according to the natural distribution induced by the DAG topology."""


def _process_rf_one_sided_coefficients(one_sided, one_sided_coefficients):
    rf_type_suffix = "distance"
    RFType = IntState
    if one_sided is None:
        # Only then will one_sided_coefficients be considered
        if one_sided_coefficients != (1, 1):
            rf_type_suffix = "nonstandard"
            # As long as both coefficients are integers, RF distances will
            # be integers. Otherwise, we need to allow floats by using
            # FloatState objects.
            if not all(isinstance(it, int) for it in one_sided_coefficients):
                RFType = FloatState
    elif one_sided.lower() == "left":
        one_sided_coefficients = (1, 0)
        rf_type_suffix = "left_difference"
    elif one_sided.lower() == "right":
        one_sided_coefficients = (0, 1)
        rf_type_suffix = "right_difference"
    else:
        raise ValueError(
            f"Argument `one_sided` must have value 'left', 'right', or None, not {one_sided}"
        )
    s, t = one_sided_coefficients
    return s, t, rf_type_suffix, RFType


def sum_rfdistance_funcs(
    reference_dag: "HistoryDag",
    rooted: bool = True,
    one_sided: str = None,
    one_sided_coefficients: Tuple[float, float] = (1, 1),
):
    """Provides functions to compute the sum, over all histories in the
    provided reference DAG, of rooted RF distances to those histories.

    Args:
        reference_dag: The reference DAG. The sum will be computed over all
            RF distances to histories in this DAG
        rooted: If False, use edges' splits for RF distance computation.
            Otherwise, use the clade below each edge.
        one_sided: May be 'left', 'right', or None. 'left' means that we count
            splits (or clades, in the rooted case) which are in the reference
            trees but not in the DAG tree, especially useful if trees in the
            DAG might be resolutions of multifurcating trees in the reference
            DAG. 'right' means that we count splits or clades in the DAG tree
            which are not in the reference trees, useful if the reference
            trees are possibly resolutions of multifurcating trees in the DAG.
            If not None, one_sided_coefficients are ignored.
        one_sided_coefficients: coefficients for non-standard symmetric
            difference calculations (explained in notes below)

    The reference DAG must have the same taxa as all the trees in the DAG on
    which these count functions are used. If this is not true, methods using
    the keyword arguments produced by this function may fail silently,
    returning values which mean nothing.

    This function allows computation of sums of a Robinson-Foulds distance
    generalized by the coefficients ``(s, t)`` provided to the
    ``one_sided_coefficients`` argument (or implicitly set by the
    ``one_sided`` argument). Given a tree in the DAG with set of clades (or
    splits) A, and a tree in the reference DAG with set of clades B, this
    distance is given by:

    ``d_{s,t}(A, B) = s|B - A| + t|A - B|``

    Notice that when s and t are both 1, this is the symmetric difference of
    A and B, the standard RF distance.

    For each tree A in a DAG, the AddFuncDict returned by this function
    computes the sum of this distance over all trees B in the reference DAG.

    Note that when computing unrooted weights, the sums are over all rooted
    trees in the reference DAG, so a single unrooted tree contained twice in
    the reference DAG with different rootings will be counted twice.

    Weights are represented by an IntState object and are shifted by a
    constant K, which is the sum of the number of clades in each tree in the
    DAG.
    """
    s, t, rf_type_suffix, RFType = _process_rf_one_sided_coefficients(
        one_sided, one_sided_coefficients
    )

    N = reference_dag.count_nodes(collapse=True, rooted=rooted)

    # K is the constant that the weights are shifted by
    K = s * sum(N.values())

    # We also scale num_trees by t...
    num_trees = t * reference_dag.count_histories()

    if rooted:

        def make_intstate(n):
            return RFType(n + K, state=n)

        def edge_func(n1, n2):
            clade = n2.clade_union()
            clade_count = N.get(clade, 0)
            weight = num_trees - ((s + t) * clade_count)
            return make_intstate(weight)

        kwargs = AddFuncDict(
            {
                "start_func": lambda n: make_intstate(0),
                "edge_weight_func": edge_func,
                "accum_func": lambda wlist: make_intstate(
                    sum(w.state for w in wlist)
                ),  # summation over edge weights
            },
            name="RF_rooted_sum_" + rf_type_suffix,
        )
    else:
        taxa = next(reference_dag.dagroot.children()).clade_union()
        n_taxa = len(taxa)

        def is_history_root(n):
            # TODO this is slow and dirty! Make more efficient
            return len(list(n.clade_union())) == n_taxa

        def split(node):
            cu = node.clade_union()
            return frozenset({cu, taxa - cu})

        # We accumulate tuples, where the first number contains the weight,
        # except any contribution of a split below a bifurcating root node
        # is contained in the second number. This way its contribution can be
        # added exactly once.
        def make_intstate(tup):
            return RFType(tup[0] + tup[1] + K, state=tup)

        def summer(tupseq):
            tupseq = list(tupseq)
            a = 0
            for ia, _ in tupseq:
                a += ia
            # The second value should only be counted once. Any nonzero
            # values of the second value will always be identical.
            if len(tupseq) == 0:
                b = 0
            else:
                b = max(tupseq, key=lambda tup: abs(tup[1]))[1]
            return (a, b)

        def edge_func(n1, n2):
            spl = split(n2)
            spl_count = N.get(spl, 0)
            if n1.is_ua_node():
                return make_intstate((0, 0))
            else:
                val = num_trees - ((s + t) * spl_count)
                if len(n1.clades) == 2 and is_history_root(n1):
                    return make_intstate((0, val))
                else:
                    return make_intstate((val, 0))

        kwargs = AddFuncDict(
            {
                "start_func": lambda n: make_intstate((0, 0)),
                "edge_weight_func": edge_func,
                "accum_func": lambda wlist: make_intstate(
                    summer(w.state for w in wlist)
                ),  # summation over edge weights
            },
            name="RF_unrooted_sum_" + rf_type_suffix,
        )

    return kwargs


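# Usage sketch (editor's example; "dag" and "reference_dag" are hypothetical
# HistoryDags on matching taxa). The returned AddFuncDict is passed as keyword
# arguments, as described in the AddFuncDict docstring:
#
#   >>> kwargs = sum_rfdistance_funcs(reference_dag)  # doctest: +SKIP
#   >>> dag.trim_optimal_weight(**kwargs)             # doctest: +SKIP

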
def make_rfdistance_countfuncs(
    ref_tree: "HistoryDag",
    rooted: bool = False,
    one_sided: str = None,
    one_sided_coefficients: Tuple[float, float] = (1, 1),
):
    """Provides functions to compute Robinson-Foulds (RF) distances of trees
    in a DAG, relative to a fixed reference tree.

    We use :meth:`ete3.TreeNode.robinson_foulds` as the reference
    implementation for unrooted RF distance.

    Rooted Robinson-Foulds is simply the cardinality of the symmetric
    difference of the clade sets of two trees, including the root clade.
    Since we include the root clade in this calculation, our rooted RF
    distance does not match the rooted
    :meth:`ete3.TreeNode.robinson_foulds` implementation.

    Args:
        ref_tree: A tree with respect to which Robinson-Foulds distance will
            be computed.
        rooted: If False, use edges' splits for RF distance computation.
            Otherwise, use the clade below each edge.
        one_sided: May be 'left', 'right', or None. 'left' means that we count
            splits (or clades, in the rooted case) which are in the reference
            tree but not in the DAG tree, especially useful if trees in the
            DAG might be resolutions of a multifurcating reference. 'right'
            means that we count splits or clades in the DAG tree which are not
            in the reference tree, useful if the reference tree is possibly a
            resolution of multifurcating trees in the DAG. If not None,
            one_sided_coefficients are ignored.
        one_sided_coefficients: coefficients for non-standard symmetric
            difference calculations (explained in notes below)

    The reference tree must have the same taxa as all the trees in the DAG.

    This calculation relies on the observation that the symmetric distance
    between the splits (or clades, in the rooted case) A in a tree in the
    DAG, and the splits (or clades) B in the reference tree, can be computed
    as:

    ``|B ^ A| = |B - A| + |A - B| = |B| - |A n B| + |A - B|``

    As long as tree edges are in bijection with splits, this can be computed
    without constructing the set A, by considering each edge's split
    independently.

    In order to accommodate multiple edges with the same split in a tree with
    root bifurcation, we keep track of the contribution of such edges
    separately.

    One-sided RF distances are computed in this framework by introducing a
    pair of ``one_sided_coefficients`` ``(s, t)``, which affect how much
    weight is given to the right and left differences in the RF distance
    calculation:

    ``d_{s,t}(A, B) = s|B - A| + t|A - B| = s(|B| - |A n B|) + t|A - B|``

    When both ``s`` and ``t`` are 1, we get the standard RF distance. When
    ``s=1`` and ``t=0``, then we have a one-sided "left" RF difference,
    counting the number of splits in the reference tree which are not in each
    DAG tree. When ``one_sided`` is set to `left`, then these coefficients
    will be used, regardless of the values passed. When ``s=0`` and ``t=1``,
    then we have a one-sided "right" RF difference, counting the number of
    splits in each DAG tree which are not in the reference. When
    ``one_sided`` is set to `right`, these coefficients will be used,
    regardless of the values passed.

    The weight type is a tuple wrapped in an IntState object. The first tuple
    value `a` is the contribution of edges which are not part of a root
    bifurcation, where edges whose splits are in B contribute `-1`, and edges
    whose splits are not in B contribute `1`; the second tuple value `b` is
    the contribution of the edges which are part of a root bifurcation. The
    value of the IntState is computed as `a + sign(b) + |B|`, which on the UA
    node of the hDAG gives RF distance.
    """
    s, t, rf_type_suffix, RFType = _process_rf_one_sided_coefficients(
        one_sided, one_sided_coefficients
    )

    taxa = frozenset(n.label for n in ref_tree.get_leaves())

    if not rooted:

        def split(node):
            cu = node.clade_union()
            return frozenset({cu, taxa - cu})

        ref_splits = frozenset(split(node) for node in ref_tree.preorder())
        # Remove the above-root split, which doesn't map to any tree edge:
        ref_splits = ref_splits - {
            frozenset({taxa, frozenset()}),
        }
        shift = s * len(ref_splits)

        n_taxa = len(taxa)

        def is_history_root(n):
            # TODO this is slow and dirty! Make more efficient
            return len(list(n.clade_union())) == n_taxa

        def sign(n):
            # Should return the value of a single term corresponding
            # to the identical root splits below a bifurcating root
            return (-s) * (n < 0) + t * (n > 0)

        def summer(tupseq):
            a, b = 0, 0
            for ia, ib in tupseq:
                a += ia
                b += ib
            return (a, b)

        def make_intstate(tup):
            return RFType(tup[0] + shift + sign(tup[1]), state=tup)

        def edge_func(n1, n2):
            spl = split(n2)
            if n1.is_ua_node():
                return make_intstate((0, 0))
            if len(n1.clades) == 2 and is_history_root(n1):
                if spl in ref_splits:
                    return make_intstate((0, -1))
                else:
                    return make_intstate((0, 1))
            else:
                if spl in ref_splits:
                    return make_intstate((-s, 0))
                else:
                    return make_intstate((t, 0))

        kwargs = AddFuncDict(
            {
                "start_func": lambda n: make_intstate((0, 0)),
                "edge_weight_func": edge_func,
                "accum_func": lambda wlist: make_intstate(
                    summer(w.state for w in wlist)
                ),
            },
            name="RF_unrooted_distance_" + rf_type_suffix,
        )
    else:
        ref_cus = frozenset(
            node.clade_union() for node in ref_tree.preorder(skip_ua_node=True)
        )
        shift = s * len(ref_cus)

        def make_intstate(n):
            return RFType(n + shift, state=n)

        def edge_func(n1, n2):
            if n2.clade_union() in ref_cus:
                inval = 1
            else:
                inval = 0
            return make_intstate(t - (s + t) * inval)

        kwargs = AddFuncDict(
            {
                "start_func": lambda n: make_intstate(0),
                "edge_weight_func": edge_func,
                "accum_func": lambda wlist: make_intstate(
                    sum(w.state for w in wlist)
                ),
            },
            name="RF_rooted_" + rf_type_suffix,
        )

    return kwargs


def make_newickcountfuncs(
    name_func=lambda n: "unnamed",
    features=None,
    feature_funcs={},
    internal_labels=True,
    collapse_leaves=False,
):
    """Provides functions to count newick strings.

    For use with :meth:`historydag.HistoryDag.weight_count`.

    Arguments are the same as for :meth:`historydag.HistoryDag.to_newick`.
    """

    def _newicksum(newicks):
        # Filter out collapsed/deleted edges
        snewicks = sorted(newicks)
        if len(snewicks) == 2 and ";" in [
            newick[-1] for newick in snewicks if newick
        ]:
            # Then we are adding an edge above a complete tree
            return "".join(
                sorted(snewicks, key=lambda n: ";" == n[-1] if n else False)
            )[:-1]
        else:
            # Then we're just accumulating options between clades
            return "(" + ",".join(snewicks) + ")"

    def _newickedgeweight(n1, n2):
        if collapse_leaves and n2.is_leaf() and n1.label == n2.label:
            return "COLLAPSED_LEAF;"
        elif (
            internal_labels
            or n2.is_leaf()
            or (collapse_leaves and frozenset({n2.label}) in n2.clades)
        ):
            return (
                n2._newick_label(
                    name_func=name_func,
                    features=features,
                    feature_funcs=feature_funcs,
                )
                + ";"
            )
        else:
            return ";"

    return AddFuncDict(
        {
            "start_func": lambda n: "",
            "edge_weight_func": _newickedgeweight,
            "accum_func": _newicksum,
        },
        name="NewickString",
    )


def edge_difference_funcs(reference_dag: "HistoryDag", key=lambda n: n):
    """Provides functions to compute the number of edges in a history which
    do not appear in a reference HistoryDag.

    This is useful for taking history-wise intersections of DAGs, or counting
    the number of histories which would appear in such an intersection.

    Args:
        reference_dag: The reference DAG. These functions will count the
            number of edges in a history which do not appear in this DAG.
        key: A function accepting a HistoryDagNode and returning a hashable
            value by which edges are compared.

    Returns:
        :class:`utils.AddFuncDict` object for use with HistoryDag methods for
        trimming and weight counting/annotation.
    """
    edge_set = set(
        (key(n), key(c)) for n in reference_dag.preorder() for c in n.children()
    )

    def edge_weight_func(n1, n2):
        return int((key(n1), key(n2)) not in edge_set)

    return AddFuncDict(
        {
            "start_func": lambda n: 0,
            "edge_weight_func": edge_weight_func,
            "accum_func": sum,
        },
        name="EdgeDifference",
    )


def _history_method(method):
    """HistoryDag method decorator to ensure that the method is only run on
    history DAGs which are histories."""

    @wraps(method)
    def wrapper(self, *args, **kwargs):
        if not self.is_history():
            raise ValueError(
                "to_newick requires the history DAG to be a history. "
                "To extract newicks from a general DAG, see to_newicks"
            )
        else:
            return method(self, *args, **kwargs)

    return wrapper


def prod(ls: list):
    """Return the product of the elements of the input list.

    If the passed list is empty, returns 1.
    """
    n = len(ls)
    if n > 0:
        accum = ls[0]
        if n > 1:
            for item in ls[1:]:
                accum *= item
    else:
        accum = 1
    return accum


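# Examples (editor's checks):
#
#   >>> prod([2, 3, 4])
#   24
#   >>> prod([])
#   1

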
def logsumexp(ls: List[float]):
    """A numerically stable implementation of logsumexp, similar to Scipy's."""
    if len(ls) == 1:
        return ls[0]
    max_log = max(ls)
    if not isfinite(max_log):
        max_log = 0
    exponentiated = [exp(a - max_log) for a in ls]
    shifted_log_sum = log(sum(exponentiated))
    return shifted_log_sum + max_log


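# Example (editor's check): log(0.5) and log(0.5) sum to log(1) = 0.
#
#   >>> round(logsumexp([log(0.5), log(0.5)]), 12)
#   0.0

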
# Unfortunately these can't be made with a class factory (just a bit too meta
# for Python), short of doing something awful like
# https://hg.python.org/cpython/file/b14308524cff/Lib/collections/__init__.py#l232


def _remstate(kwargs):
    if "state" not in kwargs:
        kwargs["state"] = None
    intkwargs = kwargs.copy()
    intkwargs.pop("state")
    return intkwargs


class IntState(int):
    """A subclass of int, with arbitrary, mutable state.

    State is provided to the constructor as the keyword argument ``state``.
    All other arguments will be passed to the ``int`` constructor. Instances
    should be functionally indistinguishable from ``int``.
    """

    def __new__(cls, *args, **kwargs):
        intkwargs = _remstate(kwargs)
        return super(IntState, cls).__new__(cls, *args, **intkwargs)

    def __init__(self, *args, **kwargs):
        self.state = kwargs["state"]

    def __copy__(self):
        return IntState(int(self), state=self.state)

    def __getstate__(self):
        return {"val": int(self), "state": self.state}

    def __setstate__(self, statedict):
        self.state = statedict["state"]


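# Example (editor's sketch): an IntState behaves like an int but carries
# state.
#
#   >>> w = IntState(3, state="origin")
#   >>> w + 1, w.state
#   (4, 'origin')

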
class FloatState(float):
    """A subclass of float, with arbitrary, mutable state.

    State is provided to the constructor as the keyword argument ``state``.
    All other arguments will be passed to the ``float`` constructor. Instances
    should be functionally indistinguishable from ``float``.
    """

    def __new__(cls, *args, **kwargs):
        intkwargs = _remstate(kwargs)
        return super(FloatState, cls).__new__(cls, *args, **intkwargs)

    def __init__(self, *args, **kwargs):
        self.state = kwargs["state"]

    def __copy__(self):
        return FloatState(float(self), state=self.state)

    def __getstate__(self):
        return {"val": float(self), "state": self.state}

    def __setstate__(self, statedict):
        self.state = statedict["state"]


class DecimalState(Decimal):
    """A subclass of ``decimal.Decimal``, with arbitrary, mutable state.

    State is provided to the constructor as the keyword argument ``state``.
    All other arguments will be passed to the ``Decimal`` constructor.
    Instances should be functionally indistinguishable from ``Decimal``.
    """

    def __new__(cls, *args, **kwargs):
        intkwargs = _remstate(kwargs)
        return super(DecimalState, cls).__new__(cls, *args, **intkwargs)

    def __init__(self, *args, **kwargs):
        self.state = kwargs["state"]

    def __copy__(self):
        return DecimalState(Decimal(self), state=self.state)

    def __getstate__(self):
        return {"val": Decimal(self), "state": self.state}

    def __setstate__(self, statedict):
        self.state = statedict["state"]


class StrState(str):
    """A subclass of string, with arbitrary, mutable state.

    State is provided to the constructor as the keyword argument ``state``.
    All other arguments will be passed to the ``str`` constructor. Instances
    should be functionally indistinguishable from ``str``.
    """

    def __new__(cls, *args, **kwargs):
        intkwargs = _remstate(kwargs)
        return super(StrState, cls).__new__(cls, *args, **intkwargs)

    def __init__(self, *args, **kwargs):
        self.state = kwargs["state"]

    def __copy__(self):
        return StrState(str(self), state=self.state)

    def __getstate__(self):
        return {"val": str(self), "state": self.state}

    def __setstate__(self, statedict):
        self.state = statedict["state"]


def count_labeled_binary_topologies(n):
    """Returns the number of binary topologies on n labeled leaves.

    In these topologies, left and right branches are not distinguished, and
    internal nodes are not ranked.
    """
    return prod(range(1, 2 * n - 2, 2))


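# Example (editor's check): prod(range(1, 2n - 2, 2)) is the double factorial
# (2n - 3)!!, e.g. 15 topologies on 4 labeled leaves.
#
#   >>> count_labeled_binary_topologies(4)
#   15

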
def powerset(iterable, start_size=0, end_size=None):
    """Produce all subsets of iterable (as tuples of elements), with sizes
    starting at start_size and ending at end_size (inclusive), or at the size
    of the passed iterable if end_size is None."""
    items = list(iterable)
    if end_size is None:
        end_size = len(items)
    return chain.from_iterable(
        combinations(items, r) for r in range(start_size, end_size + 1)
    )


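# Example (editor's check):
#
#   >>> list(powerset([1, 2], start_size=1))
#   [(1,), (2,), (1, 2)]

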
def binary_support(clade_size, total_leaves, normalized=True):
    """Calculate the fraction of binary trees on total_leaves leaves which
    contain a particular clade of clade_size leaves.

    If normalized is False, instead returns the number of binary topologies
    which would contain a particular clade of size clade_size.
    """
    if clade_size > total_leaves:
        raise ValueError("Clade size cannot exceed total number of leaves in tree")
    count = count_labeled_binary_topologies(
        clade_size
    ) * count_labeled_binary_topologies(total_leaves - clade_size + 1)
    # This could certainly be more numerically stable
    if normalized:
        return count / count_labeled_binary_topologies(total_leaves)
    else:
        return count


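# Example (editor's check): 3 of the 15 binary topologies on 4 leaves contain
# a fixed 2-leaf clade.
#
#   >>> binary_support(2, 4)
#   0.2

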
def count_resolved_clade_supports(
    n_child_clades, threshold=-1, min_size=1, max_size=None
):
    """Returns a generator on (clade size, support) pairs, for clades which
    would result from binary resolution of a node with n_child_clades
    children.

    Clade size means the number of children of this node which are grouped
    below a node. Summing over the first element of yielded tuples gives the
    number of elements which would be yielded by
    :meth:`iter_resolved_clade_supports` provided with n_child_clades child
    clades and the same threshold value.

    Args:
        n_child_clades: The number of children of the multifurcating node
        threshold: If a resolved node's clade support value is below this
            threshold, that clade will not be counted.
        min_size: The minimum size of a clade to be counted.
        max_size: The (inclusive) maximum size of a clade to be counted. The
            maximum value is ``n_child_clades``, which is equivalent to the
            default value.

    Note that by default, the root clade (grouping all children of the
    multifurcating node), as well as all single-child clades, are included
    and each have a support of 1. To exclude single-child clades, pass
    ``min_size=2``.
    """
    num_children = n_child_clades
    if max_size is None:
        max_size = num_children
    elif max_size > num_children:
        raise ValueError("max_size cannot be greater than n_child_clades")
    elif max_size < 1:
        raise ValueError("max_size cannot be less than 1")
    for unflattened_clade_size in range(min_size, max_size + 1):
        # support will be the same for all clades of this size...
        support = binary_support(unflattened_clade_size, num_children)
        # ... so this check need only be done num_children times
        if support > threshold:
            yield (comb(num_children, unflattened_clade_size), support)


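# Example (editor's check): resolving a trifurcation, each single-child clade
# and the root clade have support 1, and each of the three two-child clades
# has support 1/3.
#
#   >>> list(count_resolved_clade_supports(3))
#   [(3, 1.0), (3, 0.3333333333333333), (1, 1.0)]

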
def iter_resolved_clade_supports(
    node_child_clades, threshold=-1, min_size=1, max_size=None
):
    """Returns a generator on (clade, support) pairs, for clades which would
    result from binary resolution of a node possessing child clade sets in
    node_child_clades.

    All clades with support > threshold are yielded, avoiding iteration
    through too many clades on large multifurcations.

    Args:
        node_child_clades: A set of frozensets containing the clades of the
            children of a multifurcating node.
        threshold: If a resolved node's clade support value is below this
            threshold, that clade will not be yielded.
        min_size: The minimum size of a clade to be yielded. Note this is NOT
            simply the size of the clade set, but rather the number of
            children of the multifurcating node which are below the resolved
            node corresponding to the clade.
        max_size: The (inclusive) maximum size of a clade to be yielded. See
            `min_size` for a description of what size means. The maximum value
            is ``len(node_child_clades)``, which is equivalent to the default
            value.

    Note that by default, the root clade, including all leaves contained in
    node_child_clades, as well as all the clades contained in
    node_child_clades, are included and each have a support of 1. To exclude
    leaves, pass ``min_size=2``.
    """
    if max_size is None:
        max_size = len(node_child_clades)
    elif max_size > len(node_child_clades):
        raise ValueError("max_size cannot be greater than the number of child clades")
    elif max_size < 1:
        raise ValueError("max_size cannot be less than 1")
    num_children = len(node_child_clades)
    for unflattened_clade_size in range(min_size, max_size + 1):
        # support will be the same for all clades of this size...
        support = binary_support(unflattened_clade_size, num_children)
        # ... so this check need only be done num_children times
        if support > threshold:
            for clade in map(
                lambda ns: frozenset(chain.from_iterable(ns)),
                powerset(
                    node_child_clades,
                    start_size=unflattened_clade_size,
                    end_size=unflattened_clade_size,
                ),
            ):
                yield (clade, support)


def read_fasta(fastapath, sequence_type=str):
    """Load a fasta file as a generator which yields (sequence ID, sequence)
    pairs.

    The function ``sequence_type`` will be called on each sequence as it is
    read from the fasta file, and the resulting object will be yielded as the
    second item in each sequence record pair.
    """
    seqids = set()
    with open(fastapath, "r") as fh:
        seqid = None
        sequence = ""
        for line in fh:
            if line[0] == ">":
                if seqid is not None:
                    yield (seqid, sequence_type(sequence))
                    seqids.add(seqid)
                seqid = line[1:].strip()
                sequence = ""
                if seqid in seqids:
                    raise ValueError(
                        "Duplicate records with matching identifier in fasta file"
                    )
            else:
                if seqid is None and line.strip():
                    raise ValueError(
                        "First non-blank line in fasta does not contain identifier"
                    )
                else:
                    sequence += line.strip().upper()
        yield (seqid, sequence_type(sequence))


def load_fasta(fastapath, sequence_type=str):
    """Load a fasta file as a dictionary, with sequence ids as keys and
    sequences as values.

    The function ``sequence_type`` will be called on each sequence as it is
    read from the fasta file, and the returned objects will be the values in
    the resulting alignment dictionary.
    """
    return dict(read_fasta(fastapath, sequence_type=sequence_type))


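# Usage sketch (editor's example, writing a throwaway two-record fasta file):
#
#   >>> import tempfile
#   >>> path = tempfile.mkstemp(suffix=".fasta")[1]
#   >>> _ = open(path, "w").write(">seq1\nACGT\n>seq2\nAC\nGT\n")
#   >>> load_fasta(path)
#   {'seq1': 'ACGT', 'seq2': 'ACGT'}

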
def _deprecate_message(message):
    def _deprecate(func):
        @wraps(func)
        def deprecated(*args, **kwargs):
            warn(message)
            return func(*args, **kwargs)

        return deprecated

    return _deprecate