Source code for trilearn.graph.trajectory

"""
A class for handling Markov chains produced from e.g. MCMC.
"""
import json

from networkx.readwrite import json_graph
import pandas as pd
import numpy as np
import networkx as nx
import trilearn.graph.empirical_graph_distribution as gdist
from trilearn.graph import graph as glib
from trilearn.distributions import sequential_junction_tree_distributions as sd



[docs]
class Trajectory:
    """
    Class for handling trajectories of decomposable graphical models.
    """
    def __init__(self):
        self.trajectory = []
        self.time = []
        self.seqdist = None
        self.burnin = 0
        self.logl = []
        self._size = []


[docs]
    def set_sampling_method(self, method):
        self.sampling_method = method



[docs]
    def set_sequential_distribution(self, seqdist):
        """ Set the SequentialJTDistribution for the graphs in the trajectory

        Args:
            seqdist (SequentialJTDistribution): A sequential distribution
        """
        self.seqdist = seqdist



[docs]
    def set_trajectory(self, trajectory):
        """ Set the trajectory of graphs.

        Args:
            trajectory (Trajectory): An MCMC trajectory of graphs.
        """
        self.trajectory = trajectory



[docs]
    def set_time(self, generation_time):
        self.time = generation_time



[docs]
    def add_sample(self, graph, time, logl=None):
        """ Add graph to the trajectory.

        Args:
            graph (NetworkX graph):
            time (list): List of times it took to generate each sample
        """
        self.trajectory.append(graph)
        self.time.append(time)
        if logl is not None:
            self.logl.append(logl)



[docs]
    def empirical_distribution(self, from_index=0):
        """ Build the empirical graph distribution of the trajectory.

        Args:
            from_index (int): Index of the first graph to include,
                default=0.

        Returns:
            GraphDistribution: Every graph from ``from_index`` onwards is
            added with weight ``1 / (len(trajectory) - from_index)``.
        """
        n_kept = len(self.trajectory) - from_index
        distribution = gdist.GraphDistribution()
        for sample in self.trajectory[from_index:]:
            # The weight is computed per sample, so nothing is divided when
            # the slice is empty.
            distribution.add_graph(sample, 1. / n_kept)
        return distribution



[docs]
    def log_likelihood(self, from_index=0):
        if self.logl == []:
            self.logl = [self.seqdist.log_likelihood(g) for g in self.trajectory]
        return pd.Series(self.logl[from_index:])



[docs]
    def maximum_likelihood_graph(self):
        ml_ind = self.log_likelihood().idxmax()
        return self.trajectory[ml_ind]



[docs]
    def size(self, from_index=0):
        """ Plots the auto-correlation function of the graph size (number of edges)
        Args:
            from_index (int): Burn-in period, default=0.
        """
        if self._size == []:
            self._size = [g.size() for g in self.trajectory[from_index:]]
        return pd.Series(self._size)



[docs]
    def write_file(self, filename=None, optional={}):
        """ Writes a Trajectory together with the corresponding
        sequential distribution to a json-file.
        """

        def default(o):
            if isinstance(o, np.int64): return int(o)
            raise TypeError

        if filename is None:
            with open(str(self) + ".json", 'w') as outfile:
                json.dump(self.to_json(optional=optional), outfile, default=default)
        else:
            with open(filename, 'w') as outfile:
                json.dump(self.to_json(optional=optional), outfile, default=default)



[docs]
    def get_adjvec_trajectory(self):
        mats = []
        for graph in self.trajectory:
            m = nx.to_numpy_array(graph, dtype=int)
            mats.append(m.flatten().tolist())
        return mats



[docs]
    def graph_diff_trajectory_df(self, labels):

        def list_to_string(edge_list):            
            s = "["
            for i, e in enumerate(edge_list):  
                s += str(labels[e[0]]) + "-" + str(labels[e[1]]) 
                if i!= len(edge_list)-1:
                    s +=";"
            return s + "]"
            
        added = [] 
        removed = []
        
        for i in range(1, self.trajectory[0].order()):
            added += [(0, i)]
        
        df = pd.DataFrame({"index": [-2],
                            "added" : [list_to_string(added)],
                            "removed" : [list_to_string([])],
                            "score" : [0]})

        

        df2 = pd.DataFrame({"index": [-1],
                            "added" : [list_to_string([])],
                            "removed" : [list_to_string(added)],
                            "score" : [0]})

        df = pd.concat([df, df2], ignore_index=False)

        added = list(self.trajectory[0].edges)
        removed = []

        df2 = pd.DataFrame({"index": [0],
                            "added" : [list_to_string(added)],
                            "removed" : [list_to_string([])],
                            "score" : [ self.log_likelihood()[0]]})
    
        df = pd.concat([df, df2], ignore_index=False)

        for i in range(1, len(self.trajectory[1:-1])):
            g_cur = self.trajectory[i]
            g_prev = self.trajectory[i-1]

            if glib.hash_graph(g_cur) != glib.hash_graph(g_prev):
                
                # To avoid duplicates like (1,2) and (2,1)
                cur_edges = {frozenset(e) for e in g_cur.edges}
                prev_edges = {frozenset(e) for e in g_prev.edges}

                added = [tuple(e) for e in cur_edges - prev_edges]
                removed = [tuple(e) for e in prev_edges - cur_edges]
                
                df2 = pd.DataFrame({"index": [i],
                                    "added" : [list_to_string(added)],
                                    "removed" : [list_to_string(removed)],
                                    "score" : [self.log_likelihood()[i]]})
                
                df = pd.concat([df, df2], ignore_index=False)
        return df



[docs]
    def write_adjvec_trajectory(self, filename):
        """ Writes the trajectory of adjacency matrices to file.
        """
        mats = self.get_adjvec_trajectory()
        with open(filename, 'w') as outfile:
                json.dump(mats, outfile)



[docs]
    def to_json(self, optional={}):
        js_graphs = [json_graph.node_link_data(graph) for
                     graph in self.trajectory]

        mcmc_traj = {"model": self.seqdist.get_json_model(),
                     "run_time": self.time,
                     "optional": optional,
                     "sampling_method": self.sampling_method,
                     "trajectory": js_graphs
                     }
        return mcmc_traj




[docs]
    def from_json(self, mcmc_json):

        graphs = [json_graph.node_link_graph(js_graph)
                  for js_graph in mcmc_json["trajectory"]]

        self.set_trajectory(graphs)
        self.set_time(mcmc_json["run_time"])
        self.optional = mcmc_json["optional"]
        self.sampling_method = mcmc_json["sampling_method"]
        if mcmc_json["model"]["name"] == "ggm_jt_post":
            self.seqdist = sd.GGMJTPosterior()
        elif mcmc_json["model"]["name"] == "loglin_jt_post":
            self.seqdist = sd.LogLinearJTPosterior()

        self.seqdist.init_model_from_json(mcmc_json["model"])



[docs]
    def read_file(self, filename):
        """ Reads a trajectory from json-file.
        """
        with open(filename) as mcmc_file:
            mcmc_json = json.load(mcmc_file)

        self.from_json(mcmc_json)


    def __str__(self):
        if self.sampling_method["method"] == "pgibbs":
            return "pgibbs_graph_trajectory_" + str(self.seqdist) + "_length_" + str(len(self.trajectory)) + \
            "_N_" + str(self.sampling_method["params"]["N"]) + \
            "_alpha_" + str(self.sampling_method["params"]["alpha"]) + \
            "_beta_" + str(self.sampling_method["params"]["beta"]) + \
            "_radius_" + str(self.sampling_method["params"]["radius"])
        elif self.sampling_method["method"] == "mh":
            return "mh_graph_trajectory_" + str(self.seqdist) + "_length_" + str(len(self.trajectory)) + \
                "_randomize_interval_" + str(self.sampling_method["params"]["randomize_interval"])