doc/ref/pmi_2Analysis_8py_source.html

 #!/usr/bin/env python


 """@namespace IMP.pmi.analysis

    Tools for clustering and cluster analysis

 """

 import IMP

 import IMP.algebra

 import IMP.em

 import IMP.pmi

 import IMP.pmi.tools

 import IMP.pmi.output

 import IMP.rmf

 import RMF

 import IMP.pmi.analysis

 from operator import itemgetter

 from math import log, sqrt

 import itertools

 import numpy as np


 class Alignment:

     """Performs alignment and RMSD calculation for two sets of coordinates


     The class also takes into account non-equal stoichiometry of the proteins.

     If this is the case, the protein names of proteins in multiple copies

     should be specified in the following form:

     nameA..1, nameA..2 (note two dots).

     """


     def __init__(self, template, query, weights=None):

         """Constructor.

            @param query {'p1':coords(L,3), 'p2':coords(L,3)}

            @param template {'p1':coords(L,3), 'p2':coords(L,3)}

            @param weights optional weights for each set of coordinates

         """

         self.query = query

         self.template = template

         self.weights = weights


         if len(self.query.keys()) != len(self.template.keys()):

             raise ValueError('''the number of proteins

                                in template and query does not match!''')


     def permute(self):

         # get unique protein names for each protein

         # this is, unfortunately, expecting that names are all 'Molname..X'

         #  where X is different for each copy.

         self.proteins = sorted(self.query.keys())

         prots_uniq = [i.split('..')[0] for i in self.proteins]


         # for each unique name, store list of permutations

         # e.g. for keys A..1,A..2 store P[A] = [[A..1,A..2],[A..2,A..1]]

         # then store the product: [[[A1,A2],[B1,B2]],[[A1,A2],[B2,B1]],

         #                          [[A2,A1],[B1,B2]],[[A2,A1],[B2,B1]]]

         P = {}

         for p in prots_uniq:

             copies = [i for i in self.proteins if i.split('..')[0] == p]

             prmts = list(itertools.permutations(copies, len(copies)))

             P[p] = prmts

         self.P = P

         self.Product = list(itertools.product(*P.values()))


     def get_rmsd(self):


         self.permute()


         template_xyz = []

         weights = []

         torder = sum([list(i) for i in self.Product[0]], [])

         for t in torder:

             template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]

             if self.weights is not None:

                 weights += [i for i in self.weights[t]]


         self.rmsd = 10000000000.

         for comb in self.Product:


             order = sum([list(i) for i in comb], [])

             query_xyz = []

             for p in order:

                 query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]


             if self.weights is not None:

                 dist = IMP.algebra.get_weighted_rmsd(

                     template_xyz, query_xyz, weights)

             else:

                 dist = IMP.algebra.get_rmsd(template_xyz, query_xyz)

             if dist < self.rmsd:

                 self.rmsd = dist

         return self.rmsd


     def align(self):

         from scipy.spatial.distance import cdist


         self.permute()


         # create flat coordinate list from template in standard order

         # then loop through the permutations and try to align and get RMSD

         # should allow you to try all mappings within each protein

         template_xyz = []

         torder = sum([list(i) for i in self.Product[0]], [])

         for t in torder:

             template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]

         # template_xyz = np.array(template_xyz)

         self.rmsd, Transformation = 10000000000., ''


         # then for each permutation, get flat list of coords and get RMSD

         for comb in self.Product:

             order = sum([list(i) for i in comb], [])

             query_xyz = []

             for p in order:

                 query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]


             if len(template_xyz) != len(query_xyz):

                 raise ValueError('''the number of coordinates

                                in template and query does not match!''')


             transformation = \

                 IMP.algebra.get_transformation_aligning_first_to_second(

                     query_xyz, template_xyz)

             query_xyz_tr = [transformation.get_transformed(n)

                             for n in query_xyz]


             dist = sqrt(

                 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2))

                 / len(template_xyz))

             if dist < self.rmsd:

                 self.rmsd = dist

                 Transformation = transformation


         # return the transformation

         return (self.rmsd, Transformation)


 # TEST for the alignment ###

 """

 Proteins = {'a..1':np.array([np.array([-1.,1.])]),

             'a..2':np.array([np.array([1.,1.,])]),

             'a..3':np.array([np.array([-2.,1.])]),

             'b':np.array([np.array([0.,-1.])]),

             'c..1':np.array([np.array([-1.,-1.])]),

             'c..2':np.array([np.array([1.,-1.])]),

             'd':np.array([np.array([0.,0.])]),

             'e':np.array([np.array([0.,1.])])}


 Ali = Alignment(Proteins, Proteins)

 Ali.permute()

 if Ali.get_rmsd() == 0.0: print 'successful test!'

 else: print 'ERROR!'; exit()

 """


 # ----------------------------------

 class Violations:


     def __init__(self, filename):


         self.violation_thresholds = {}

         self.violation_counts = {}


         data = open(filename)

         D = data.readlines()

         data.close()


         for d in D:

             d = d.strip().split()

             self.violation_thresholds[d[0]] = float(d[1])


     def get_number_violated_restraints(self, rsts_dict):

         num_violated = 0

         for rst in self.violation_thresholds:

             if rst not in rsts_dict:

                 continue  # print rst;

             if float(rsts_dict[rst]) > self.violation_thresholds[rst]:

                 num_violated += 1

                 if rst not in self.violation_counts:

                     self.violation_counts[rst] = 1

                 else:

                     self.violation_counts[rst] += 1

         return num_violated


 class Clustering:

     """A class to cluster structures.

     Uses scipy's cdist function to compute distance matrices

     and sklearn's kmeans clustering module.

     """

     def __init__(self, rmsd_weights=None):

         """Constructor.

            @param rmsd_weights Flat list of weights for each particle

                                (if they're coarse)

         """

         try:

             from mpi4py import MPI

             self.comm = MPI.COMM_WORLD

             self.rank = self.comm.Get_rank()

             self.number_of_processes = self.comm.size

         except ImportError:

             self.number_of_processes = 1

             self.rank = 0

         self.all_coords = {}

         self.structure_cluster_ids = None

         self.tmpl_coords = None

         self.rmsd_weights = rmsd_weights


     def set_template(self, part_coords):

         self.tmpl_coords = part_coords


     def fill(self, frame, Coords):

         """Add coordinates for a single model."""

         self.all_coords[frame] = Coords


     def dist_matrix(self):

         self.model_list_names = list(self.all_coords.keys())

         self.model_indexes = list(range(len(self.model_list_names)))

         self.model_indexes_dict = dict(

             list(zip(self.model_list_names, self.model_indexes)))

         model_indexes_unique_pairs = \

             list(itertools.combinations(self.model_indexes, 2))


         my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(

             model_indexes_unique_pairs,

             self.number_of_processes)[self.rank]


         print("process %s assigned with %s pairs"

               % (str(self.rank), str(len(my_model_indexes_unique_pairs))))


         (raw_distance_dict, self.transformation_distance_dict) = \

             self.matrix_calculation(self.all_coords, self.tmpl_coords,

                                     my_model_indexes_unique_pairs)


         if self.number_of_processes > 1:

             raw_distance_dict = IMP.pmi.tools.scatter_and_gather(

                 raw_distance_dict)

             pickable_transformations = \

                 self.get_pickable_transformation_distance_dict()

             pickable_transformations = IMP.pmi.tools.scatter_and_gather(

                 pickable_transformations)

             self.set_transformation_distance_dict_from_pickable(

                 pickable_transformations)


         self.raw_distance_matrix = np.zeros(

             (len(self.model_list_names), len(self.model_list_names)))

         for item in raw_distance_dict:

             (f1, f2) = item

             self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]

             self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]


     def get_dist_matrix(self):

         return self.raw_distance_matrix


     def do_cluster(self, number_of_clusters, seed=None):

         """Run K-means clustering

         @param number_of_clusters Num means

         @param seed the random seed

         """

         from sklearn.cluster import KMeans

         if seed is not None:

             np.random.seed(seed)

         try:

             # check whether we have the right version of sklearn

             kmeans = KMeans(n_clusters=number_of_clusters)

         except TypeError:

             # sklearn older than 0.12

             kmeans = KMeans(k=number_of_clusters)

         kmeans.fit_predict(self.raw_distance_matrix)


         self.structure_cluster_ids = kmeans.labels_


     def get_pickable_transformation_distance_dict(self):

         pickable_transformations = {}

         for label in self.transformation_distance_dict:

             tr = self.transformation_distance_dict[label]

             trans = tuple(tr.get_translation())

             rot = tuple(tr.get_rotation().get_quaternion())

             pickable_transformations[label] = (rot, trans)

         return pickable_transformations


     def set_transformation_distance_dict_from_pickable(

         self,

             pickable_transformations):

         self.transformation_distance_dict = {}

         for label in pickable_transformations:

             tr = pickable_transformations[label]

             trans = IMP.algebra.Vector3D(tr[1])

             rot = IMP.algebra.Rotation3D(tr[0])

             self.transformation_distance_dict[

                 label] = IMP.algebra.Transformation3D(rot, trans)


     def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):

         import pickle

         outf = open(file_name + ".data", 'wb')


         # to pickle the transformation dictionary

         # you have to save the arrays correposnding to

         # the transformations


         pickable_transformations = \

             self.get_pickable_transformation_distance_dict()

         pickle.dump(

             (self.structure_cluster_ids,

              self.model_list_names,

              pickable_transformations),

             outf)


         np.save(file_name + ".npy", self.raw_distance_matrix)


     def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):

         import pickle


         inputf = open(file_name + ".data", 'rb')

         (self.structure_cluster_ids, self.model_list_names,

          pickable_transformations) = pickle.load(inputf)

         inputf.close()


         self.raw_distance_matrix = np.load(file_name + ".npy")


         self.set_transformation_distance_dict_from_pickable(

             pickable_transformations)

         self.model_indexes = list(range(len(self.model_list_names)))

         self.model_indexes_dict = dict(

             list(zip(self.model_list_names, self.model_indexes)))


     def plot_matrix(self, figurename="clustermatrix.pdf"):

         import matplotlib as mpl

         mpl.use('Agg')

         import matplotlib.pylab as pl

         from scipy.cluster import hierarchy as hrc


         fig = pl.figure(figsize=(10, 8))

         ax = fig.add_subplot(212)

         dendrogram = hrc.dendrogram(

             hrc.linkage(self.raw_distance_matrix),

             color_threshold=7,

             no_labels=True)

         leaves_order = dendrogram['leaves']

         ax.set_xlabel('Model')

         ax.set_ylabel('RMSD [Angstroms]')


         ax2 = fig.add_subplot(221)

         cax = ax2.imshow(

             self.raw_distance_matrix[leaves_order,

                                      :][:,

                                         leaves_order],

             interpolation='nearest')

         cb = fig.colorbar(cax)

         cb.set_label('RMSD [Angstroms]')

         ax2.set_xlabel('Model')

         ax2.set_ylabel('Model')


         pl.savefig(figurename, dpi=300)

         pl.close(fig)


     def get_model_index_from_name(self, name):

         return self.model_indexes_dict[name]


     def get_cluster_labels(self):

         # this list

         return list(set(self.structure_cluster_ids))


     def get_number_of_clusters(self):

         return len(self.get_cluster_labels())


     def get_cluster_label_indexes(self, label):

         return (

             [i for i, l in enumerate(self.structure_cluster_ids) if l == label]

         )


     def get_cluster_label_names(self, label):

         return (

             [self.model_list_names[i]

                 for i in self.get_cluster_label_indexes(label)]

         )


     def get_cluster_label_average_rmsd(self, label):


         indexes = self.get_cluster_label_indexes(label)


         if len(indexes) > 1:

             sub_distance_matrix = self.raw_distance_matrix[

                 indexes, :][:, indexes]

             average_rmsd = np.sum(sub_distance_matrix) / \

                 (len(sub_distance_matrix)

                  ** 2 - len(sub_distance_matrix))

         else:

             average_rmsd = 0.0

         return average_rmsd


     def get_cluster_label_size(self, label):

         return len(self.get_cluster_label_indexes(label))


     def get_transformation_to_first_member(

         self,

         cluster_label,

             structure_index):

         reference = self.get_cluster_label_indexes(cluster_label)[0]

         return self.transformation_distance_dict[(reference, structure_index)]


     def matrix_calculation(self, all_coords, template_coords, list_of_pairs):


         model_list_names = list(all_coords.keys())

         rmsd_protein_names = list(all_coords[model_list_names[0]].keys())

         raw_distance_dict = {}

         transformation_distance_dict = {}

         if template_coords is None:

             do_alignment = False

         else:

             do_alignment = True

             alignment_template_protein_names = list(template_coords.keys())


         for (f1, f2) in list_of_pairs:


             if not do_alignment:

                 # here we only get the rmsd,

                 # we need that for instance when you want to cluster

                 # conformations globally, eg the EM map is a reference

                 transformation = IMP.algebra.get_identity_transformation_3d()


                 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])

                                  for pr in rmsd_protein_names])

                 coords_f2 = {}

                 for pr in rmsd_protein_names:

                     coords_f2[pr] = all_coords[model_list_names[f2]][pr]


                 Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)

                 rmsd = Ali.get_rmsd()


             elif do_alignment:

                 # here we actually align the conformations first

                 # and than calculate the rmsd. We need that when the

                 # protein(s) is the reference

                 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])

                                  for pr in alignment_template_protein_names])

                 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])

                                  for pr in alignment_template_protein_names])


                 Ali = Alignment(coords_f1, coords_f2)

                 template_rmsd, transformation = Ali.align()


                 # here we calculate the rmsd

                 # we will align two models based n the number of subunits

                 # provided and transform coordinates of model 2 to model 1

                 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])

                                  for pr in rmsd_protein_names])

                 coords_f2 = {}

                 for pr in rmsd_protein_names:

                     coords_f2[pr] = [transformation.get_transformed(

                         i) for i in all_coords[model_list_names[f2]][pr]]


                 Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)

                 rmsd = Ali.get_rmsd()


             raw_distance_dict[(f1, f2)] = rmsd

             raw_distance_dict[(f2, f1)] = rmsd

             transformation_distance_dict[(f1, f2)] = transformation

             transformation_distance_dict[(f2, f1)] = transformation


         return raw_distance_dict, transformation_distance_dict


 class RMSD:

     """Compute the RMSD (without alignment) taking into account the copy

        ambiguity.

     To be used with pmi2 hierarchies. Can be used for instance as follows:


     rmsd = IMP.pmi.analysis.RMSD(

         hier, hier, [mol.get_name() for mol in mols],

         dynamic0=True, dynamic1=False)

     output_objects.append(rmsd)


     before shuffling the coordinates

     """


     def __init__(self, hier0, hier1, molnames, label="None", dynamic0=True,

                  dynamic1=True, metric=IMP.algebra.get_rmsd):

         """

         @param hier0 first input hierarchy

         @param hier1 second input hierarchy

         @param molnames the names of the molecules used for the RMSD

         @dynamic0 if True stores the decorators XYZ and coordinates of

                   hier0 can be update. If false coordinates are static

                   (stored in Vector3Ds) and will never be updated

         @dynamic1 same as above

         metric what metric should be used

         """

         self.moldict0, self.molcoords0, self.mol_XYZs0 = \

             self.get_moldict_coorddict(hier0, molnames)

         self.moldict1, self.molcoords1, self.mol_XYZs1 = \

             self.get_moldict_coorddict(hier1, molnames)

         self.dynamic0 = dynamic0

         self.dynamic1 = dynamic1

         self.metric = metric

         self.label = label


     def get_moldict_coorddict(self, hier, molnames):

         """return data structure for the RMSD calculation"""

         moldict = {}

         mol_coords = {}

         mol_XYZs = {}

         for mol in IMP.pmi.tools.get_molecules(hier):

             name = mol.get_name()

             if name not in molnames:

                 continue

             parts = True

             mol_coords[mol] = []

             mol_XYZs[mol] = []

             i = 1

             while parts:

                 sel = IMP.atom.Selection(

                     mol, residue_index=i,

                     representation_type=IMP.atom.BALLS,

                     resolution=1)

                 parts = sel.get_selected_particles()

                 if parts:

                     mol_coords[mol].append(

                         IMP.core.XYZ(parts[0]).get_coordinates())

                     mol_XYZs[mol].append(IMP.core.XYZ(parts[0]))

                     i = i+1

             if name in moldict:

                 moldict[name].append(mol)

             else:

                 moldict[name] = [mol]

         return moldict, mol_coords, mol_XYZs


     def get_rmsd_and_assigments(self):

         total_rmsd = 0

         total_N = 0

         best_assignments = []

         rmsd_dict = {}

         for molname, ref_mols in self.moldict1.items():

             ref_coords = []

             for ref_mol in ref_mols:

                 if self.dynamic1:

                     coord1 = [XYZ.get_coordinates()

                               for XYZ in self.mol_XYZs1[ref_mol]]

                 else:

                     coord1 = self.molcoords1[ref_mol]

                 ref_coords += coord1


             rmsd = []

             rmf_mols_list = []

             for rmf_mols in itertools.permutations(self.moldict0[molname]):

                 rmf_coords = []

                 for rmf_mol in rmf_mols:

                     if self.dynamic0:

                         coord0 = [XYZ.get_coordinates()

                                   for XYZ in self.mol_XYZs0[rmf_mol]]

                     else:

                         coord0 = self.molcoords0[rmf_mol]

                     rmf_coords += coord0


                 rmsd.append(IMP.algebra.get_rmsd(ref_coords, rmf_coords))

                 rmf_mols_list.append(rmf_mols)

             m = min(rmsd)

             rmf_mols_best_order = rmf_mols_list[rmsd.index(m)]


             for n,  (ref_mol, rmf_mol) in enumerate(

                     zip(ref_mols, rmf_mols_best_order)):

                 best_assignments.append((rmf_mol, ref_mol))

                 if self.dynamic0:

                     coord0 = [XYZ.get_coordinates()

                               for XYZ in self.mol_XYZs0[rmf_mol]]

                 else:

                     coord0 = self.molcoords0[rmf_mol]


                 if self.dynamic1:

                     coord1 = [XYZ.get_coordinates()

                               for XYZ in self.mol_XYZs1[ref_mol]]

                 else:

                     coord1 = self.molcoords1[ref_mol]

                 rmsd_pair = self.metric(coord1, coord0)

                 N = len(self.molcoords1[ref_mol])

                 total_N += N

                 total_rmsd += rmsd_pair*rmsd_pair*N

                 rmsd_dict[ref_mol] = rmsd_pair

         total_rmsd = sqrt(total_rmsd/total_N)

         return total_rmsd, best_assignments


     def get_output(self):

         """Returns output for IMP.pmi.output.Output object"""

         total_rmsd, best_assignments = self.get_rmsd_and_assigments()


         assignments_out = []

         for rmf_mol, ref_mol in best_assignments:

             ref_name = ref_mol.get_name()

             ref_copy = IMP.atom.Copy(ref_mol).get_copy_index()

             rmf_name = rmf_mol.get_name()

             rmf_copy = IMP.atom.Copy(rmf_mol).get_copy_index()

             assignments_out.append(rmf_name + "." + str(rmf_copy) + "->" +

                                    ref_name + "." + str(ref_copy))

         return {"RMSD_" + self.label: str(total_rmsd),

                 "RMSD_assignments_" + self.label: str(assignments_out)}


 class Precision:

     """A class to evaluate the precision of an ensemble.


     Also can evaluate the cross-precision of multiple ensembles.

     Supports MPI for coordinate reading.

     Recommended procedure:

       -# initialize object and pass the selection for evaluating precision

       -# call add_structures() to read in the data (specify group name)

       -# call get_precision() to evaluate inter/intra precision

       -# call get_rmsf() to evaluate within-group fluctuations

     """

     def __init__(self, model, resolution=1, selection_dictionary={}):

         """Constructor.

            @param model The IMP Model

            @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is

                   part of the hier name)

            @param selection_dictionary Dictionary where keys are names for

                   selections and values are selection tuples for scoring

                   precision. "All" is automatically made as well

         """

         try:

             from mpi4py import MPI

             self.comm = MPI.COMM_WORLD

             self.rank = self.comm.Get_rank()

             self.number_of_processes = self.comm.size

         except ImportError:

             self.number_of_processes = 1

             self.rank = 0


         self.styles = ['pairwise_rmsd', 'pairwise_drmsd_k', 'pairwise_drmsd_Q',

                        'pairwise_drms_k', 'pairwise_rmsd', 'drmsd_from_center']

         self.style = 'pairwise_drmsd_k'

         self.structures_dictionary = {}

         self.reference_structures_dictionary = {}

         self.prots = []

         self.protein_names = None

         self.len_particles_resolution_one = None

         self.model = model

         self.rmf_names_frames = {}

         self.reference_rmf_names_frames = None

         self.reference_structure = None

         self.reference_prot = None

         self.selection_dictionary = selection_dictionary

         self.threshold = 40.0

         self.residue_particle_index_map = None

         self.prots = None

         if resolution in [1, 10]:

             self.resolution = resolution

         else:

             raise KeyError("Currently only allow resolution 1 or 10")


     def _get_structure(self, rmf_frame_index, rmf_name):

         """Read an RMF file and return the particles"""

         rh = RMF.open_rmf_file_read_only(rmf_name)

         if not self.prots:

             print("getting coordinates for frame %i rmf file %s"

                   % (rmf_frame_index, rmf_name))

             self.prots = IMP.rmf.create_hierarchies(rh, self.model)

             IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))

         else:

             print("linking coordinates for frame %i rmf file %s"

                   % (rmf_frame_index, rmf_name))

             IMP.rmf.link_hierarchies(rh, self.prots)

             IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))

         del rh


         if self.resolution == 1:

             particle_dict = get_particles_at_resolution_one(self.prots[0])

         elif self.resolution == 10:

             particle_dict = get_particles_at_resolution_ten(self.prots[0])


         protein_names = list(particle_dict.keys())

         particles_resolution_one = []

         for k in particle_dict:

             particles_resolution_one += (particle_dict[k])


         if self.protein_names is None:

             self.protein_names = protein_names

         else:

             if self.protein_names != protein_names:

                 print("Error: the protein names of the new coordinate set "

                       "is not compatible with the previous one")


         if self.len_particles_resolution_one is None:

             self.len_particles_resolution_one = len(particles_resolution_one)

         else:

             if self.len_particles_resolution_one != \

                     len(particles_resolution_one):

                 raise ValueError("the new coordinate set is not compatible "

                                  "with the previous one")


         return particles_resolution_one


     def add_structure(self,

                       rmf_name,

                       rmf_frame_index,

                       structure_set_name,

                       setup_index_map=False):

         """ Read a structure into the ensemble and store (as coordinates).

         @param rmf_name The name of the RMF file

         @param rmf_frame_index The frame to read

         @param structure_set_name Name for the set that includes this structure

                                   (e.g. "cluster 1")

         @param setup_index_map if requested, set up a dictionary to help

                find residue indexes

         """


         # decide where to put this structure

         if structure_set_name in self.structures_dictionary:

             cdict = self.structures_dictionary[structure_set_name]

             rmflist = self.rmf_names_frames[structure_set_name]

         else:

             self.structures_dictionary[structure_set_name] = {}

             self.rmf_names_frames[structure_set_name] = []

             cdict = self.structures_dictionary[structure_set_name]

             rmflist = self.rmf_names_frames[structure_set_name]


         # read the particles

         try:

             particles_resolution_one = self._get_structure(

                 rmf_frame_index, rmf_name)

         except ValueError:

             print("something wrong with the rmf")

             return 0

         self.selection_dictionary.update({"All": self.protein_names})


         for selection_name in self.selection_dictionary:

             selection_tuple = self.selection_dictionary[selection_name]

             coords = self._select_coordinates(

                 selection_tuple, particles_resolution_one, self.prots[0])

             if selection_name not in cdict:

                 cdict[selection_name] = [coords]

             else:

                 cdict[selection_name].append(coords)


         rmflist.append((rmf_name, rmf_frame_index))


         # if requested, set up a dictionary to help find residue indexes

         if setup_index_map:

             self.residue_particle_index_map = {}

             for prot_name in self.protein_names:

                 self.residue_particle_index_map[prot_name] = \

                        self._get_residue_particle_index_map(

                            prot_name, particles_resolution_one, self.prots[0])


     def add_structures(self,

                        rmf_name_frame_tuples,

                        structure_set_name):

         """Read a list of RMFs, supports parallel

         @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)

         @param structure_set_name Name this set of structures

                (e.g. "cluster.1")

         """


         # split up the requested list to read in parallel

         my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(

             rmf_name_frame_tuples, self.number_of_processes)[self.rank]

         for nfr, tup in enumerate(my_rmf_name_frame_tuples):

             rmf_name = tup[0]

             rmf_frame_index = tup[1]

             # the first frame stores the map between residues and particles

             if self.residue_particle_index_map is None:

                 setup_index_map = True

             else:

                 setup_index_map = False

             self.add_structure(rmf_name,

                                rmf_frame_index,

                                structure_set_name,

                                setup_index_map)


         # synchronize the structures

         if self.number_of_processes > 1:

             self.rmf_names_frames = IMP.pmi.tools.scatter_and_gather(

                 self.rmf_names_frames)

             if self.rank != 0:

                 self.comm.send(self.structures_dictionary, dest=0, tag=11)

             elif self.rank == 0:

                 for i in range(1, self.number_of_processes):

                     data_tmp = self.comm.recv(source=i, tag=11)

                     for key in self.structures_dictionary:

                         self.structures_dictionary[key].update(data_tmp[key])

                 for i in range(1, self.number_of_processes):

                     self.comm.send(self.structures_dictionary, dest=i, tag=11)

             if self.rank != 0:

                 self.structures_dictionary = self.comm.recv(source=0, tag=11)


     def _get_residue_particle_index_map(self, prot_name, structure, hier):

         # Creates map from all particles to residue numbers

         residue_particle_index_map = []

         s = IMP.atom.Selection(hier, molecules=[prot_name], resolution=1)

         all_selected_particles = s.get_selected_particles()

         intersection = list(set(all_selected_particles) & set(structure))

         sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)

         for p in sorted_intersection:

             residue_particle_index_map.append(

                 IMP.pmi.tools.get_residue_indexes(p))

         return residue_particle_index_map


     def _select_coordinates(self, tuple_selections, structure, prot):

         selected_coordinates = []

         for t in tuple_selections:

             if isinstance(t, tuple) and len(t) == 3:

                 s = IMP.atom.Selection(

                     prot, molecules=[t[2]],

                     residue_indexes=range(t[0], t[1]+1), resolution=1)

                 all_selected_particles = s.get_selected_particles()

                 intersection = list(set(all_selected_particles)

                                     & set(structure))

                 sorted_intersection = \

                     IMP.pmi.tools.sort_by_residues(intersection)

                 cc = [tuple(IMP.core.XYZ(p).get_coordinates())

                       for p in sorted_intersection]

                 selected_coordinates += cc

             elif isinstance(t, str):

                 s = IMP.atom.Selection(prot, molecules=[t], resolution=1)

                 all_selected_particles = s.get_selected_particles()

                 intersection = list(set(all_selected_particles)

                                     & set(structure))

                 sorted_intersection = \

                     IMP.pmi.tools.sort_by_residues(intersection)

                 cc = [tuple(IMP.core.XYZ(p).get_coordinates())

                       for p in sorted_intersection]

                 selected_coordinates += cc

             else:

                 raise ValueError("Selection error")

         return selected_coordinates


     def set_threshold(self, threshold):

         self.threshold = threshold


     def _get_distance(self, structure_set_name1, structure_set_name2,

                       selection_name, index1, index2):

         """Compute distance between structures with various metrics"""

         c1 = self.structures_dictionary[

             structure_set_name1][selection_name][index1]

         c2 = self.structures_dictionary[

             structure_set_name2][selection_name][index2]

         coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]

         coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]


         if self.style == 'pairwise_drmsd_k':

             distance = IMP.atom.get_drmsd(coordinates1, coordinates2)

         if self.style == 'pairwise_drms_k':

             distance = IMP.atom.get_drms(coordinates1, coordinates2)

         if self.style == 'pairwise_drmsd_Q':

             distance = IMP.atom.get_drmsd_Q(coordinates1, coordinates2,

                                             self.threshold)


         if self.style == 'pairwise_rmsd':

             distance = IMP.algebra.get_rmsd(coordinates1, coordinates2)

         return distance


     def _get_particle_distances(self, structure_set_name1, structure_set_name2,

                                 selection_name, index1, index2):

         c1 = self.structures_dictionary[

             structure_set_name1][selection_name][index1]

         c2 = self.structures_dictionary[

             structure_set_name2][selection_name][index2]


         coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]

         coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]


         distances = [np.linalg.norm(a-b)

                      for (a, b) in zip(coordinates1, coordinates2)]


         return distances


     def get_precision(self, structure_set_name1, structure_set_name2,

                       outfile=None, skip=1, selection_keywords=None):

         """ Evaluate the precision of two named structure groups. Supports MPI.

         When the structure_set_name1 is different from the structure_set_name2,

         this evaluates the cross-precision (average pairwise distances).

         @param outfile Name of the precision output file

         @param structure_set_name1  string name of the first structure set

         @param structure_set_name2  string name of the second structure set

         @param skip analyze every (skip) structure for the distance

                matrix calculation

         @param selection_keywords Specify the selection name you want to

                calculate on. By default this is computed for everything

                you provided in the constructor, plus all the subunits together.

         """

         if selection_keywords is None:

             sel_keys = list(self.selection_dictionary.keys())

         else:

             for k in selection_keywords:

                 if k not in self.selection_dictionary:

                     raise KeyError(

                         "you are trying to find named selection "

                         + k + " which was not requested in the constructor")

             sel_keys = selection_keywords


         if outfile is not None:

             of = open(outfile, "w")

         centroid_index = 0

         for selection_name in sel_keys:

             number_of_structures_1 = len(self.structures_dictionary[

                 structure_set_name1][selection_name])

             number_of_structures_2 = len(self.structures_dictionary[

                 structure_set_name2][selection_name])

             distances = {}

             structure_pointers_1 = list(range(0, number_of_structures_1, skip))

             structure_pointers_2 = list(range(0, number_of_structures_2, skip))

             pair_combination_list = list(

                 itertools.product(structure_pointers_1, structure_pointers_2))

             if len(pair_combination_list) == 0:

                 raise ValueError("no structure selected. Check the "

                                  "skip parameter.")


             # compute pairwise distances in parallel

             my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(

                 pair_combination_list, self.number_of_processes)[self.rank]

             for n, pair in enumerate(my_pair_combination_list):

                 distances[pair] = self._get_distance(

                     structure_set_name1, structure_set_name2, selection_name,

                     pair[0], pair[1])

             if self.number_of_processes > 1:

                 distances = IMP.pmi.tools.scatter_and_gather(distances)


             # Finally compute distance to centroid

             if self.rank == 0:

                 if structure_set_name1 == structure_set_name2:

                     structure_pointers = structure_pointers_1

                     number_of_structures = number_of_structures_1


                     # calculate the distance from the first centroid

                     #  and determine the centroid

                     distance = 0.0

                     distances_to_structure = {}

                     distances_to_structure_normalization = {}


                     for n in structure_pointers:

                         distances_to_structure[n] = 0.0

                         distances_to_structure_normalization[n] = 0


                     for k in distances:

                         distance += distances[k]

                         distances_to_structure[k[0]] += distances[k]

                         distances_to_structure[k[1]] += distances[k]

                         distances_to_structure_normalization[k[0]] += 1

                         distances_to_structure_normalization[k[1]] += 1


                     for n in structure_pointers:

                         distances_to_structure[n] = \

                                 distances_to_structure[n] \

                                 / distances_to_structure_normalization[n]


                     min_distance = min([distances_to_structure[n]

                                        for n in distances_to_structure])

                     centroid_index = [k for k, v in

                                       distances_to_structure.items()

                                       if v == min_distance][0]

                     centroid_rmf_name = \

                         self.rmf_names_frames[

                             structure_set_name1][centroid_index]


                     centroid_distance = 0.0

                     distance_list = []

                     for n in range(number_of_structures):

                         dist = self._get_distance(

                             structure_set_name1, structure_set_name1,

                             selection_name, centroid_index, n)

                         centroid_distance += dist

                         distance_list.append(dist)


                     centroid_distance /= number_of_structures

                     if outfile is not None:

                         of.write(str(selection_name) + " " +

                                  structure_set_name1 +

                                  " average centroid distance " +

                                  str(centroid_distance) + "\n")

                         of.write(str(selection_name) + " " +

                                  structure_set_name1 + " centroid index " +

                                  str(centroid_index) + "\n")

                         of.write(str(selection_name) + " " +

                                  structure_set_name1 + " centroid rmf name " +

                                  str(centroid_rmf_name) + "\n")

                         of.write(str(selection_name) + " " +

                                  structure_set_name1 +

                                  " median centroid distance  " +

                                  str(np.median(distance_list)) + "\n")


                 average_pairwise_distances = sum(

                     distances.values())/len(list(distances.values()))

                 if outfile is not None:

                     of.write(str(selection_name) + " " + structure_set_name1 +

                              " " + structure_set_name2 +

                              " average pairwise distance " +

                              str(average_pairwise_distances) + "\n")

         if outfile is not None:

             of.close()

         return centroid_index


     def get_rmsf(self, structure_set_name, outdir="./", skip=1,

                  set_plot_yaxis_range=None):

         """ Calculate the residue mean square fluctuations (RMSF).

         Automatically outputs as data file and pdf

         @param structure_set_name Which structure set to calculate RMSF for

         @param outdir Where to write the files

         @param skip Skip this number of structures

         @param set_plot_yaxis_range In case you need to change the plot

         """

         # get the centroid structure for the whole complex

         centroid_index = self.get_precision(structure_set_name,

                                             structure_set_name,

                                             outfile=None,

                                             skip=skip)

         if self.rank == 0:

             for sel_name in self.protein_names:

                 self.selection_dictionary.update({sel_name: [sel_name]})

                 try:

                     number_of_structures = \

                         len(self.structures_dictionary[

                             structure_set_name][sel_name])

                 except KeyError:

                     # that protein was not included in the selection

                     continue

                 rpim = self.residue_particle_index_map[sel_name]

                 outfile = outdir+"/rmsf."+sel_name+".dat"

                 of = open(outfile, "w")

                 residue_distances = {}

                 residue_nblock = {}

                 for index in range(number_of_structures):

                     distances = self._get_particle_distances(

                         structure_set_name, structure_set_name, sel_name,

                         centroid_index, index)

                     for nblock, block in enumerate(rpim):

                         for residue_number in block:

                             residue_nblock[residue_number] = nblock

                             if residue_number not in residue_distances:

                                 residue_distances[residue_number] = \

                                     [distances[nblock]]

                             else:

                                 residue_distances[

                                     residue_number].append(distances[nblock])


                 residues = []

                 rmsfs = []

                 for rn in residue_distances:

                     residues.append(rn)

                     rmsf = np.std(residue_distances[rn])

                     rmsfs.append(rmsf)

                     of.write(str(rn) + " " + str(residue_nblock[rn])

                              + " " + str(rmsf) + "\n")


                 IMP.pmi.output.plot_xy_data(

                     residues, rmsfs, title=sel_name,

                     out_fn=outdir+"/rmsf."+sel_name, display=False,

                     set_plot_yaxis_range=set_plot_yaxis_range,

                     xlabel='Residue Number', ylabel='Standard error')

                 of.close()


     def set_reference_structure(self, rmf_name, rmf_frame_index):

         """Read in a structure used for reference computation.

         Needed before calling get_average_distance_wrt_reference_structure()

         @param rmf_name The RMF file to read the reference

         @param rmf_frame_index The index in that file

         """

         particles_resolution_one = self._get_structure(rmf_frame_index,

                                                        rmf_name)

         self.reference_rmf_names_frames = (rmf_name, rmf_frame_index)


         for selection_name in self.selection_dictionary:

             selection_tuple = self.selection_dictionary[selection_name]

             coords = self._select_coordinates(

                 selection_tuple, particles_resolution_one, self.prots[0])

             self.reference_structures_dictionary[selection_name] = coords


     def get_rmsd_wrt_reference_structure_with_alignment(

             self, structure_set_name, alignment_selection_key):

         """First align then calculate RMSD

         @param structure_set_name: the name of the structure set

         @param alignment_selection_key: the key containing the selection tuples

                needed to make the alignment stored in self.selection_dictionary

         @return: for each structure in the structure set, returns the rmsd

         """

         ret = {}

         if self.reference_structures_dictionary == {}:

             print("Cannot compute until you set a reference structure")

             return


         align_reference_coordinates = \

             self.reference_structures_dictionary[alignment_selection_key]

         align_coordinates = self.structures_dictionary[

             structure_set_name][alignment_selection_key]

         transformations = []

         for c in align_coordinates:

             Ali = IMP.pmi.analysis.Alignment(

                 {"All": align_reference_coordinates}, {"All": c})

             transformation = Ali.align()[1]

             transformations.append(transformation)

         for selection_name in self.selection_dictionary:

             reference_coordinates = \

                 self.reference_structures_dictionary[selection_name]

             coordinates2 = [IMP.algebra.Vector3D(c)

                             for c in reference_coordinates]

             distances = []

             for n, sc in enumerate(self.structures_dictionary[

                     structure_set_name][selection_name]):

                 coordinates1 = [

                     transformations[n].get_transformed(IMP.algebra.Vector3D(c))

                     for c in sc]

                 distance = IMP.algebra.get_rmsd(coordinates1, coordinates2)

                 distances.append(distance)

             ret[selection_name] = {

                 'all_distances': distances,

                 'average_distance': sum(distances)/len(distances),

                 'minimum_distance': min(distances)}

         return ret


     def _get_median(self, list_of_values):

         return np.median(np.array(list_of_values))


     def get_average_distance_wrt_reference_structure(self, structure_set_name):

         """Compare the structure set to the reference structure.

         @param structure_set_name The structure set to compute this on

         @note First call set_reference_structure()

         """

         ret = {}

         if self.reference_structures_dictionary == {}:

             print("Cannot compute until you set a reference structure")

             return

         for selection_name in self.selection_dictionary:

             reference_coordinates = self.reference_structures_dictionary[

                 selection_name]

             coordinates2 = [IMP.algebra.Vector3D(c)

                             for c in reference_coordinates]

             distances = []

             for sc in self.structures_dictionary[

                     structure_set_name][selection_name]:

                 coordinates1 = [IMP.algebra.Vector3D(c) for c in sc]

                 if self.style == 'pairwise_drmsd_k':

                     distance = IMP.atom.get_drmsd(coordinates1, coordinates2)

                 if self.style == 'pairwise_drms_k':

                     distance = IMP.atom.get_drms(coordinates1, coordinates2)

                 if self.style == 'pairwise_drmsd_Q':

                     distance = IMP.atom.get_drmsd_Q(

                         coordinates1, coordinates2, self.threshold)

                 if self.style == 'pairwise_rmsd':

                     distance = IMP.algebra.get_rmsd(coordinates1, coordinates2)

                 distances.append(distance)


             print(selection_name, "average distance",

                   sum(distances)/len(distances), "minimum distance",

                   min(distances), 'nframes', len(distances))

             ret[selection_name] = {

                 'average_distance': sum(distances)/len(distances),

                 'minimum_distance': min(distances)}

         return ret


     def get_coordinates(self):

         pass


     def set_precision_style(self, style):

         if style in self.styles:

             self.style = style

         else:

             raise ValueError("No such style")


 class GetModelDensity:

     """Compute mean density maps from structures.


     Keeps a dictionary of density maps,

     keys are in the custom ranges. When you call add_subunits_density, it adds

     particle coordinates to the existing density maps.

     """


     def __init__(self, custom_ranges, representation=None, resolution=20.0,

                  voxel=5.0):

         """Constructor.

            @param custom_ranges  Required. It's a dictionary, keys are the

                   density component names, values are selection tuples

                           e.g. {'kin28':[['kin28',1,-1]],

                                'density_name_1' :[('ccl1')],

                                'density_name_2' :[(1,142,'tfb3d1'),

                                                   (143,700,'tfb3d2')],

            @param resolution The MRC resolution of the output map (in

                   Angstrom unit)

            @param voxel The voxel size for the output map (lower is slower)

         """


         if representation:

             IMP.handle_use_deprecated(

                 "The 'representation' argument is deprecated "

                 "(and is ignored). Pass hierarchies to "

                 "add_subunits_density() instead.")

         self.MRCresolution = resolution

         self.voxel = voxel

         self.densities = {}

         self.count_models = 0.0

         self.custom_ranges = custom_ranges


     def add_subunits_density(self, hierarchy):

         """Add a frame to the densities.

         @param hierarchy The hierarchy to add.

         """

         self.count_models += 1.0


         part_dict = get_particles_at_resolution_one(hierarchy)

         all_particles_by_resolution = []

         for name in part_dict:

             all_particles_by_resolution += part_dict[name]


         for density_name in self.custom_ranges:

             parts = []

             all_particles_by_segments = []


             for seg in self.custom_ranges[density_name]:

                 if isinstance(seg, str):

                     s = IMP.atom.Selection(hierarchy, molecule=seg)

                 elif isinstance(seg, tuple) and len(seg) == 2:

                     s = IMP.atom.Selection(

                         hierarchy, molecule=seg[0], copy_index=seg[1])

                 elif isinstance(seg, tuple) and len(seg) == 3:

                     s = IMP.atom.Selection(

                         hierarchy, molecule=seg[2],

                         residue_indexes=range(seg[0], seg[1] + 1))

                 elif isinstance(seg, tuple) and len(seg) == 4:

                     s = IMP.atom.Selection(

                         hierarchy, molecule=seg[2],

                         residue_indexes=range(seg[0], seg[1] + 1),

                         copy_index=seg[3])

                 else:

                     raise Exception('could not understand selection tuple '

                                     + str(seg))


                 all_particles_by_segments += s.get_selected_particles()

             parts = all_particles_by_segments

             self._create_density_from_particles(parts, density_name)


     def normalize_density(self):

         pass


     def _create_density_from_particles(self, ps, name,

                                        kernel_type='GAUSSIAN'):

         '''Internal function for adding to densities.

         pass XYZR particles with mass and create a density from them.

         kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''

         dmap = IMP.em.SampledDensityMap(ps, self.MRCresolution, self.voxel)

         dmap.calcRMS()

         dmap.set_was_used(True)

         if name not in self.densities:

             self.densities[name] = dmap

         else:

             bbox1 = IMP.em.get_bounding_box(self.densities[name])

             bbox2 = IMP.em.get_bounding_box(dmap)

             bbox1 += bbox2

             dmap3 = IMP.em.create_density_map(bbox1, self.voxel)

             dmap3.set_was_used(True)

             dmap3.add(dmap)

             dmap3.add(self.densities[name])

             self.densities[name] = dmap3


     def get_density_keys(self):

         return list(self.densities.keys())


     def get_density(self, name):

         """Get the current density for some component name"""

         if name not in self.densities:

             return None

         else:

             return self.densities[name]


     def write_mrc(self, path="./", suffix=None):

         import os

         import errno

         for density_name in self.densities:

             self.densities[density_name].multiply(1. / self.count_models)

             if suffix is None:

                 name = path + "/" + density_name + ".mrc"

             else:

                 name = path + "/" + density_name + "." + suffix + ".mrc"

             path, file = os.path.split(name)

             if not os.path.exists(path):

                 try:

                     os.makedirs(path)

                 except OSError as e:

                     if e.errno != errno.EEXIST:

                         raise

             IMP.em.write_map(self.densities[density_name], name,

                              IMP.em.MRCReaderWriter())


 class GetContactMap:


     def __init__(self, distance=15.):

         self.distance = distance

         self.contactmap = ''

         self.namelist = []

         self.xlinks = 0

         self.XL = {}

         self.expanded = {}

         self.resmap = {}


     def set_prot(self, prot):

         self.prot = prot

         self.protnames = []


         particles_dictionary = get_particles_at_resolution_one(self.prot)


         for name in particles_dictionary:

             residue_indexes = []

             for p in particles_dictionary[name]:

                 print(p.get_name())

                 residue_indexes.extend(IMP.pmi.tools.get_residue_indexes(p))


             if len(residue_indexes) != 0:

                 self.protnames.append(name)


     def get_subunit_coords(self, frame, align=0):

         from scipy.spatial.distance import cdist

         coords = []

         radii = []

         namelist = []

         for part in self.prot.get_children():

             SortedSegments = []

             print(part)

             for chl in part.get_children():

                 start = IMP.atom.get_leaves(chl)[0]


                 startres = IMP.atom.Fragment(start).get_residue_indexes()[0]

                 SortedSegments.append((chl, startres))

             SortedSegments = sorted(SortedSegments, key=itemgetter(1))


             for sgmnt in SortedSegments:

                 for leaf in IMP.atom.get_leaves(sgmnt[0]):

                     p = IMP.core.XYZR(leaf)

                     crd = np.array([p.get_x(), p.get_y(), p.get_z()])


                     coords.append(crd)

                     radii.append(p.get_radius())


                     new_name = part.get_name() + '_' + sgmnt[0].get_name() + \

                         '_' + \

                         str(IMP.atom.Fragment(leaf)

                             .get_residue_indexes()[0])

                     namelist.append(new_name)

                     self.expanded[new_name] = len(

                         IMP.atom.Fragment(leaf).get_residue_indexes())

                     if part.get_name() not in self.resmap:

                         self.resmap[part.get_name()] = {}

                     for res in IMP.atom.Fragment(leaf).get_residue_indexes():

                         self.resmap[part.get_name()][res] = new_name


         coords = np.array(coords)

         radii = np.array(radii)

         if len(self.namelist) == 0:

             self.namelist = namelist

             self.contactmap = np.zeros((len(coords), len(coords)))

         distances = cdist(coords, coords)

         distances = (distances - radii).T - radii

         distances = distances <= self.distance

         self.contactmap += distances


     def add_xlinks(self, filename,

                    identification_string='ISDCrossLinkMS_Distance_'):

         self.xlinks = 1

         with open(filename) as data:

             D = data.readlines()


         for d in D:

             if identification_string in d:

                 d = d.replace("_", " ").replace("-", " ").replace(

                     ":", " ").split()


                 t1, t2 = (d[0], d[1]), (d[1], d[0])

                 if t1 not in self.XL:

                     self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]

                     self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]

                 else:

                     self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))

                     self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))


     def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):

         K = self.namelist

         M = self.contactmap

         C, R = [], []

         proteins = self.protnames


         # exp new

         if skip_cmap == 0:

             Matrices = {}

             proteins = [p.get_name() for p in self.prot.get_children()]

             missing = []

             for p1 in range(len(proteins)):

                 for p2 in range(p1, len(proteins)):

                     pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),

                                 max(self.resmap[proteins[p2]].keys()))

                     pn1, pn2 = proteins[p1], proteins[p2]

                     mtr = np.zeros((pl1 + 1, pl2 + 1))

                     print('Creating matrix for: ',

                           p1, p2, pn1, pn2, mtr.shape, pl1, pl2)

                     for i1 in range(1, pl1 + 1):

                         for i2 in range(1, pl2 + 1):

                             try:

                                 r1 = K.index(self.resmap[pn1][i1])

                                 r2 = K.index(self.resmap[pn2][i2])

                                 r = M[r1, r2]

                                 mtr[i1 - 1, i2 - 1] = r

                             except KeyError:

                                 missing.append((pn1, pn2, i1, i2))

                     Matrices[(pn1, pn2)] = mtr


         # add cross-links

         if skip_xl == 0:

             if self.XL == {}:

                 raise ValueError("cross-links were not provided, use "

                                  "add_xlinks function!")

             Matrices_xl = {}

             for p1 in range(len(proteins)):

                 for p2 in range(p1, len(proteins)):

                     pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),

                                 max(self.resmap[proteins[p2]].keys()))

                     pn1, pn2 = proteins[p1], proteins[p2]

                     mtr = np.zeros((pl1 + 1, pl2 + 1))

                     flg = 0

                     try:

                         xls = self.XL[(pn1, pn2)]

                     except KeyError:

                         try:

                             xls = self.XL[(pn2, pn1)]

                             flg = 1

                         except KeyError:

                             flg = 2

                     if flg == 0:

                         print('Creating matrix for: ',

                               p1, p2, pn1, pn2, mtr.shape, pl1, pl2)

                         for xl1, xl2 in xls:

                             if xl1 > pl1:

                                 print('X' * 10, xl1, xl2)

                                 xl1 = pl1

                             if xl2 > pl2:

                                 print('X' * 10, xl1, xl2)

                                 xl2 = pl2

                             mtr[xl1 - 1, xl2 - 1] = 100

                     elif flg == 1:

                         print('Creating matrix for: ',

                               p1, p2, pn1, pn2, mtr.shape, pl1, pl2)

                         for xl1, xl2 in xls:

                             if xl1 > pl1:

                                 print('X' * 10, xl1, xl2)

                                 xl1 = pl1

                             if xl2 > pl2:

                                 print('X' * 10, xl1, xl2)

                                 xl2 = pl2

                             mtr[xl2 - 1, xl1 - 1] = 100

                     else:

                         print('No cross-links between: ', pn1, pn2)

                     Matrices_xl[(pn1, pn2)] = mtr


         # make list of protein names and create coordinate lists

         C = proteins

         # W is the component length list,

         # R is the contiguous coordinates list

         W, R = [], []

         for i, c in enumerate(C):

             cl = max(self.resmap[c].keys())

             W.append(cl)

             if i == 0:

                 R.append(cl)

             else:

                 R.append(R[-1] + cl)


         # start plotting

         if outname:

             # Don't require a display

             import matplotlib as mpl

             mpl.use('Agg')

         import matplotlib.pyplot as plt

         import matplotlib.gridspec as gridspec

         import scipy.sparse as sparse


         plt.figure()

         gs = gridspec.GridSpec(len(W), len(W),

                                width_ratios=W,

                                height_ratios=W)


         cnt = 0

         for x1, r1 in enumerate(R):

             for x2, r2 in enumerate(R):

                 ax = plt.subplot(gs[cnt])

                 if skip_cmap == 0:

                     try:

                         mtr = Matrices[(C[x1], C[x2])]

                     except KeyError:

                         mtr = Matrices[(C[x2], C[x1])].T

                     ax.imshow(

                         np.log(mtr),

                         interpolation='nearest',

                         vmin=0.,

                         vmax=log(mtr.max()) if mtr.max() > 0. else 1.)

                     ax.set_xticks([])

                     ax.set_yticks([])

                 if skip_xl == 0:

                     try:

                         mtr = Matrices_xl[(C[x1], C[x2])]

                     except KeyError:

                         mtr = Matrices_xl[(C[x2], C[x1])].T

                     ax.spy(

                         sparse.csr_matrix(mtr),

                         markersize=10,

                         color='white',

                         linewidth=100,

                         alpha=0.5)

                     ax.set_xticks([])

                     ax.set_yticks([])


                 cnt += 1

                 if x2 == 0:

                     ax.set_ylabel(C[x1], rotation=90)

         if outname:

             plt.savefig(outname + ".pdf", dpi=300, transparent="False")

         else:

             plt.show()


 # ------------------------------------------------------------------

 # a few random tools


 def get_hiers_from_rmf(model, frame_number, rmf_file):

     # I have to deprecate this function

     print("getting coordinates for frame %i rmf file %s"

           % (frame_number, rmf_file))


     # load the frame

     rh = RMF.open_rmf_file_read_only(rmf_file)


     try:

         prots = IMP.rmf.create_hierarchies(rh, model)

     except IOError:

         print("Unable to open rmf file %s" % (rmf_file))

         return None


     try:

         IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))

     except IOError:

         print("Unable to open frame %i of file %s" % (frame_number, rmf_file))

         return None

     model.update()

     del rh


     return prots


 def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):

     print("linking hierarchies for frame %i rmf file %s"

           % (frame_number, rmf_file))

     rh = RMF.open_rmf_file_read_only(rmf_file)

     IMP.rmf.link_hierarchies(rh, hiers)

     try:

         IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))

     except:  # noqa: E722

         print("Unable to open frame %i of file %s" % (frame_number, rmf_file))

         return False

     model.update()

     del rh

     return True


 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):

     # I have to deprecate this function

     print("getting coordinates for frame %i rmf file %s"

           % (frame_number, rmf_file))


     # load the frame

     rh = RMF.open_rmf_file_read_only(rmf_file)


     try:

         prots = IMP.rmf.create_hierarchies(rh, model)

         rs = IMP.rmf.create_restraints(rh, model)

     except IOError:

         print("Unable to open rmf file %s" % (rmf_file))

         return None, None

     try:

         IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))

     except:  # noqa: E722

         print("Unable to open frame %i of file %s" % (frame_number, rmf_file))

         return None, None

     model.update()

     del rh


     return prots, rs


 def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):

     print("linking hierarchies for frame %i rmf file %s"

           % (frame_number, rmf_file))

     rh = RMF.open_rmf_file_read_only(rmf_file)

     IMP.rmf.link_hierarchies(rh, hiers)

     IMP.rmf.link_restraints(rh, rs)

     try:

         IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))

     except:  # noqa: E722

         print("Unable to open frame %i of file %s" % (frame_number, rmf_file))

         return False

     model.update()

     del rh

     return True


 def get_particles_at_resolution_one(prot):

     """Get particles at res 1, or any beads, based on the name.

     No Representation is needed. This is mainly used when the hierarchy

     is read from an RMF file.


     @return a dictionary of component names and their particles

     @note If the root node is named "System" or is a "State", do

           proper selection.

     """

     particle_dict = {}


     # attempt to give good results for PMI2

     for mol in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):

         sel = IMP.atom.Selection(mol, resolution=1)

         particle_dict[mol.get_name()] = sel.get_selected_particles()

     return particle_dict


 def get_particles_at_resolution_ten(prot):

     """Get particles at res 10, or any beads, based on the name.

     No Representation is needed.

     This is mainly used when the hierarchy is read from an RMF file.


     @return a dictionary of component names and their particles

     @note If the root node is named "System" or is a "State", do proper

           selection.

     """

     particle_dict = {}

     # attempt to give good results for PMI2

     for mol in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):

         sel = IMP.atom.Selection(mol, resolution=10)

         particle_dict[mol.get_name()] = sel.get_selected_particles()

     return particle_dict


 class CrossLinkTable:

     """Visualization of cross-links"""

     def __init__(self):

         self.crosslinks = []

         self.external_csv_data = None

         self.crosslinkedprots = set()

         self.mindist = +10000000.0

         self.maxdist = -10000000.0

         self.contactmap = None


     def set_hierarchy(self, prot):

         self.prot_length_dict = {}

         self.model = prot.get_model()


         for i in prot.get_children():

             name = i.get_name()

             residue_indexes = []

             for p in IMP.atom.get_leaves(i):

                 residue_indexes.extend(IMP.pmi.tools.get_residue_indexes(p))


             if len(residue_indexes) != 0:

                 self.prot_length_dict[name] = max(residue_indexes)


     def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):

         from scipy.spatial.distance import cdist


         rh = RMF.open_rmf_file_read_only(rmf_name)

         prots = IMP.rmf.create_hierarchies(rh, self.model)

         IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))

         print("getting coordinates for frame %i rmf file %s"

               % (rmf_frame_index, rmf_name))

         del rh


         coords = []

         radii = []


         particles_dictionary = get_particles_at_resolution_one(prots[0])


         resindex = 0

         self.index_dictionary = {}


         for name in particles_dictionary:

             residue_indexes = []

             for p in particles_dictionary[name]:

                 print(p.get_name())

                 residue_indexes = IMP.pmi.tools.get_residue_indexes(p)


                 if len(residue_indexes) != 0:


                     for res in range(min(residue_indexes),

                                      max(residue_indexes) + 1):

                         d = IMP.core.XYZR(p)


                         crd = np.array([d.get_x(), d.get_y(), d.get_z()])

                         coords.append(crd)

                         radii.append(d.get_radius())

                         if name not in self.index_dictionary:

                             self.index_dictionary[name] = [resindex]

                         else:

                             self.index_dictionary[name].append(resindex)

                         resindex += 1


         coords = np.array(coords)

         radii = np.array(radii)


         distances = cdist(coords, coords)

         distances = (distances - radii).T - radii


         distances = np.where(distances <= 20.0, 1.0, 0)

         if self.contactmap is None:

             self.contactmap = np.zeros((len(coords), len(coords)))

         self.contactmap += distances


         for prot in prots:

             IMP.atom.destroy(prot)


     def set_crosslinks(

             self, data_file, search_label='ISDCrossLinkMS_Distance_',

             mapping=None, filter_label=None,

             # provide a list of rmf base names to filter the stat file

             filter_rmf_file_names=None, external_csv_data_file=None,

             external_csv_data_file_unique_id_key="Unique ID"):


         # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None   # noqa: E501

         # mapping is a dictionary that maps standard keywords to entry

         # positions in the key string

         # confidence class is a filter that

         # external datafile is a datafile that contains further information

         # on the cross-links

         # it will use the unique id to create the dictionary keys


         po = IMP.pmi.output.ProcessOutput(data_file)

         keys = po.get_keys()


         xl_keys = [k for k in keys if search_label in k]


         if filter_rmf_file_names is not None:

             rmf_file_key = "local_rmf_file_name"

             fs = po.get_fields(xl_keys+[rmf_file_key])

         else:

             fs = po.get_fields(xl_keys)


         # this dictionary stores the occurency of given cross-links

         self.cross_link_frequency = {}


         # this dictionary stores the series of distances for given cross-linked

         # residues

         self.cross_link_distances = {}


         # this dictionary stores the series of distances for given cross-linked

         # residues

         self.cross_link_distances_unique = {}


         if external_csv_data_file is not None:

             # this dictionary stores the further information on cross-links

             # labeled by unique ID

             self.external_csv_data = {}

             xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)


             for xl in xldb:

                 self.external_csv_data[

                     xl[external_csv_data_file_unique_id_key]] = xl


         # this list keeps track the tuple of cross-links and sample

         # so that we don't count twice the same cross-linked residues in the

         # same sample

         cross_link_frequency_list = []


         self.unique_cross_link_list = []


         for key in xl_keys:

             print(key)

             keysplit = key.replace("_", " ").replace("-", " ").replace(

                 ":", " ").split()


             if filter_label is not None:

                 if filter_label not in keysplit:

                     continue


             if mapping is None:

                 r1 = int(keysplit[5])

                 c1 = keysplit[6]

                 r2 = int(keysplit[7])

                 c2 = keysplit[8]

                 try:

                     confidence = keysplit[12]

                 except:  # noqa: E722

                     confidence = '0.0'

                 try:

                     unique_identifier = keysplit[3]

                 except:  # noqa: E722

                     unique_identifier = '0'

             else:

                 r1 = int(keysplit[mapping["Residue1"]])

                 c1 = keysplit[mapping["Protein1"]]

                 r2 = int(keysplit[mapping["Residue2"]])

                 c2 = keysplit[mapping["Protein2"]]

                 try:

                     confidence = keysplit[mapping["Confidence"]]

                 except:  # noqa: E722

                     confidence = '0.0'

                 try:

                     unique_identifier = keysplit[mapping["Unique Identifier"]]

                 except:  # noqa: E722

                     unique_identifier = '0'


             self.crosslinkedprots.add(c1)

             self.crosslinkedprots.add(c2)


             # construct the list of distances corresponding to the input rmf

             # files


             dists = []

             if filter_rmf_file_names is not None:

                 for n, d in enumerate(fs[key]):

                     if fs[rmf_file_key][n] in filter_rmf_file_names:

                         dists.append(float(d))

             else:

                 dists = [float(f) for f in fs[key]]


             # check if the input confidence class corresponds to the

             # one of the cross-link


             mdist = self.median(dists)


             stdv = np.std(np.array(dists))

             if self.mindist > mdist:

                 self.mindist = mdist

             if self.maxdist < mdist:

                 self.maxdist = mdist


             # calculate the frequency of unique cross-links within the same

             # sample

             if (r1, c1, r2, c2, mdist) not in cross_link_frequency_list:

                 if (r1, c1, r2, c2) not in self.cross_link_frequency:

                     self.cross_link_frequency[(r1, c1, r2, c2)] = 1

                     self.cross_link_frequency[(r2, c2, r1, c1)] = 1

                 else:

                     self.cross_link_frequency[(r2, c2, r1, c1)] += 1

                     self.cross_link_frequency[(r1, c1, r2, c2)] += 1

                 cross_link_frequency_list.append((r1, c1, r2, c2))

                 cross_link_frequency_list.append((r2, c2, r1, c1))

                 self.unique_cross_link_list.append(

                     (r1, c1, r2, c2, mdist))


             if (r1, c1, r2, c2) not in self.cross_link_distances:

                 self.cross_link_distances[(

                     r1, c1, r2, c2, mdist, confidence)] = dists

                 self.cross_link_distances[(

                     r2, c2, r1, c1, mdist, confidence)] = dists

                 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists

             else:

                 self.cross_link_distances[(

                     r2, c2, r1, c1, mdist, confidence)] += dists

                 self.cross_link_distances[(

                     r1, c1, r2, c2, mdist, confidence)] += dists


             self.crosslinks.append(

                 (r1, c1, r2, c2, mdist, stdv, confidence, unique_identifier,

                  'original'))

             self.crosslinks.append(

                 (r2, c2, r1, c1, mdist, stdv, confidence, unique_identifier,

                  'reversed'))


         self.cross_link_frequency_inverted = {}

         for xl in self.unique_cross_link_list:

             (r1, c1, r2, c2, mdist) = xl

             frequency = self.cross_link_frequency[(r1, c1, r2, c2)]

             if frequency not in self.cross_link_frequency_inverted:

                 self.cross_link_frequency_inverted[

                     frequency] = [(r1, c1, r2, c2)]

             else:

                 self.cross_link_frequency_inverted[

                     frequency].append((r1, c1, r2, c2))


         # -------------


     def median(self, mylist):

         sorts = sorted(mylist)

         length = len(sorts)

         print(length)

         if length == 1:

             return mylist[0]

         if not length % 2:

             return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0

         return sorts[length / 2]


     def set_threshold(self, threshold):

         self.threshold = threshold


     def set_tolerance(self, tolerance):

         self.tolerance = tolerance


     def colormap(self, dist):

         if dist < self.threshold - self.tolerance:

             return "Green"

         elif dist >= self.threshold + self.tolerance:

             return "Orange"

         else:

             return "Red"


     def write_cross_link_database(self, filename, format='csv'):

         import csv


         fieldnames = [

             "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",

             "Median Distance", "Standard Deviation", "Confidence",

             "Frequency", "Arrangement"]


         if self.external_csv_data is not None:

             keys = list(self.external_csv_data.keys())

             innerkeys = list(self.external_csv_data[keys[0]].keys())

             innerkeys.sort()

             fieldnames += innerkeys


         dw = csv.DictWriter(

             open(filename, "w"),

             delimiter=',',

             fieldnames=fieldnames)

         dw.writeheader()

         for xl in self.crosslinks:

             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, descriptor) = xl

             if descriptor == 'original':

                 outdict = {}

                 outdict["Unique ID"] = unique_identifier

                 outdict["Protein1"] = c1

                 outdict["Protein2"] = c2

                 outdict["Residue1"] = r1

                 outdict["Residue2"] = r2

                 outdict["Median Distance"] = mdist

                 outdict["Standard Deviation"] = stdv

                 outdict["Confidence"] = confidence

                 outdict["Frequency"] = self.cross_link_frequency[

                     (r1, c1, r2, c2)]

                 if c1 == c2:

                     arrangement = "Intra"

                 else:

                     arrangement = "Inter"

                 outdict["Arrangement"] = arrangement

                 if self.external_csv_data is not None:

                     outdict.update(self.external_csv_data[unique_identifier])


                 dw.writerow(outdict)


     def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,

              no_confidence_info=False, filter=None, layout="whole",

              crosslinkedonly=False, filename=None, confidence_classes=None,

              alphablend=0.1, scale_symbol_size=1.0, gap_between_components=0,

              rename_protein_map=None):

         # layout can be:

         #                "lowerdiagonal"  print only the lower diagonal plot

         #                "upperdiagonal"  print only the upper diagonal plot

         #                "whole"  print all

         # crosslinkedonly: plot only components that have cross-links

         # no_dist_info: if True will plot only the cross-links as grey spots

         # filter = tuple the tuple contains a keyword to be search

         # in the database

         #                a relationship ">","==","<"

         #                and a value

         #                example ("ID_Score",">",40)

         # scale_symbol_size rescale the symbol for the cross-link

         # rename_protein_map is a dictionary to rename proteins


         import matplotlib as mpl

         mpl.use('Agg')

         import matplotlib.pyplot as plt

         import matplotlib.cm as cm


         fig = plt.figure(figsize=(10, 10))

         ax = fig.add_subplot(111)


         ax.set_xticks([])

         ax.set_yticks([])


         # set the list of proteins on the x axis

         if prot_listx is None:

             if crosslinkedonly:

                 prot_listx = list(self.crosslinkedprots)

             else:

                 prot_listx = list(self.prot_length_dict.keys())

             prot_listx.sort()


         nresx = gap_between_components + \

             sum([self.prot_length_dict[name]

                 + gap_between_components for name in prot_listx])


         # set the list of proteins on the y axis


         if prot_listy is None:

             if crosslinkedonly:

                 prot_listy = list(self.crosslinkedprots)

             else:

                 prot_listy = list(self.prot_length_dict.keys())

             prot_listy.sort()


         nresy = gap_between_components + \

             sum([self.prot_length_dict[name]

                 + gap_between_components for name in prot_listy])


         # this is the residue offset for each protein

         resoffsetx = {}

         resendx = {}

         res = gap_between_components

         for prot in prot_listx:

             resoffsetx[prot] = res

             res += self.prot_length_dict[prot]

             resendx[prot] = res

             res += gap_between_components


         resoffsety = {}

         resendy = {}

         res = gap_between_components

         for prot in prot_listy:

             resoffsety[prot] = res

             res += self.prot_length_dict[prot]

             resendy[prot] = res

             res += gap_between_components


         resoffsetdiagonal = {}

         res = gap_between_components

         for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):

             resoffsetdiagonal[prot] = res

             res += self.prot_length_dict[prot]

             res += gap_between_components


         # plot protein boundaries


         xticks = []

         xlabels = []

         for n, prot in enumerate(prot_listx):

             res = resoffsetx[prot]

             end = resendx[prot]

             for proty in prot_listy:

                 resy = resoffsety[proty]

                 endy = resendy[proty]

                 ax.plot([res, res], [resy, endy], 'k-', lw=0.4)

                 ax.plot([end, end], [resy, endy], 'k-', lw=0.4)

             xticks.append((float(res) + float(end)) / 2)

             if rename_protein_map is not None:

                 if prot in rename_protein_map:

                     prot = rename_protein_map[prot]

             xlabels.append(prot)


         yticks = []

         ylabels = []

         for n, prot in enumerate(prot_listy):

             res = resoffsety[prot]

             end = resendy[prot]

             for protx in prot_listx:

                 resx = resoffsetx[protx]

                 endx = resendx[protx]

                 ax.plot([resx, endx], [res, res], 'k-', lw=0.4)

                 ax.plot([resx, endx], [end, end], 'k-', lw=0.4)

             yticks.append((float(res) + float(end)) / 2)

             if rename_protein_map is not None:

                 if prot in rename_protein_map:

                     prot = rename_protein_map[prot]

             ylabels.append(prot)


         # plot the contact map

         print(prot_listx, prot_listy)


         if self.contactmap is not None:

             tmp_array = np.zeros((nresx, nresy))


             for px in prot_listx:

                 print(px)

                 for py in prot_listy:

                     print(py)

                     resx = resoffsety[px]

                     lengx = resendx[px] - 1

                     resy = resoffsety[py]

                     lengy = resendy[py] - 1

                     indexes_x = self.index_dictionary[px]

                     minx = min(indexes_x)

                     maxx = max(indexes_x)

                     indexes_y = self.index_dictionary[py]

                     miny = min(indexes_y)

                     maxy = max(indexes_y)


                     print(px, py, minx, maxx, miny, maxy)


                     try:

                         tmp_array[

                             resx:lengx,

                             resy:lengy] = self.contactmap[

                             minx:maxx,

                             miny:maxy]

                     except:  # noqa: E722

                         continue


             ax.imshow(tmp_array,

                       cmap=cm.binary,

                       origin='lower',

                       interpolation='nearest')


         ax.set_xticks(xticks)

         ax.set_xticklabels(xlabels, rotation=90)

         ax.set_yticks(yticks)

         ax.set_yticklabels(ylabels)

         ax.set_xlim(0, nresx)

         ax.set_ylim(0, nresy)


         # set the cross-links


         already_added_xls = []


         for xl in self.crosslinks:


             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, descriptor) = xl


             if confidence_classes is not None:

                 if confidence not in confidence_classes:

                     continue


             try:

                 pos1 = r1 + resoffsetx[c1]

             except:  # noqa: E722

                 continue

             try:

                 pos2 = r2 + resoffsety[c2]

             except:  # noqa: E722

                 continue


             if filter is not None:

                 xldb = self.external_csv_data[unique_identifier]

                 xldb.update({"Distance": mdist})

                 xldb.update({"Distance_stdv": stdv})


                 if filter[1] == ">":

                     if float(xldb[filter[0]]) <= float(filter[2]):

                         continue


                 if filter[1] == "<":

                     if float(xldb[filter[0]]) >= float(filter[2]):

                         continue


                 if filter[1] == "==":

                     if float(xldb[filter[0]]) != float(filter[2]):

                         continue


             # all that below is used for plotting the diagonal

             # when you have a rectangolar plots


             pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]

             pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]


             if layout == 'lowerdiagonal':

                 if pos_for_diagonal1 <= pos_for_diagonal2:

                     continue

             if layout == 'upperdiagonal':

                 if pos_for_diagonal1 >= pos_for_diagonal2:

                     continue


             already_added_xls.append((r1, c1, r2, c2))


             if not no_confidence_info:

                 if confidence == '0.01':

                     markersize = 14 * scale_symbol_size

                 elif confidence == '0.05':

                     markersize = 9 * scale_symbol_size

                 elif confidence == '0.1':

                     markersize = 6 * scale_symbol_size

                 else:

                     markersize = 15 * scale_symbol_size

             else:

                 markersize = 5 * scale_symbol_size


             if not no_dist_info:

                 color = self.colormap(mdist)

             else:

                 color = "gray"


             ax.plot(

                 [pos1],

                 [pos2],

                 'o',

                 c=color,

                 alpha=alphablend,

                 markersize=markersize)


         fig.set_size_inches(0.004 * nresx, 0.004 * nresy)


         for i in ax.spines.values():

             i.set_linewidth(2.0)


         if filename:

             plt.savefig(filename + ".pdf", dpi=300, transparent="False")

         else:

             plt.show()


     def get_frequency_statistics(self, prot_list,

                                  prot_list2=None):


         violated_histogram = {}

         satisfied_histogram = {}

         unique_cross_links = []


         for xl in self.unique_cross_link_list:

             (r1, c1, r2, c2, mdist) = xl


             # here we filter by the protein

             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue


             frequency = self.cross_link_frequency[(r1, c1, r2, c2)]


             if (r1, c1, r2, c2) not in unique_cross_links:

                 if mdist > 35.0:

                     if frequency not in violated_histogram:

                         violated_histogram[frequency] = 1

                     else:

                         violated_histogram[frequency] += 1

                 else:

                     if frequency not in satisfied_histogram:

                         satisfied_histogram[frequency] = 1

                     else:

                         satisfied_histogram[frequency] += 1

                 unique_cross_links.append((r1, c1, r2, c2))

                 unique_cross_links.append((r2, c2, r1, c1))


         print("# satisfied")


         total_number_of_crosslinks = 0


         for i in satisfied_histogram:

             # if i in violated_histogram:

             #   print i, satisfied_histogram[i]+violated_histogram[i]

             # else:

             if i in violated_histogram:

                 print(i, violated_histogram[i]+satisfied_histogram[i])

             else:

                 print(i, satisfied_histogram[i])

             total_number_of_crosslinks += i*satisfied_histogram[i]


         print("# violated")


         for i in violated_histogram:

             print(i, violated_histogram[i])

             total_number_of_crosslinks += i*violated_histogram[i]


         print(total_number_of_crosslinks)


     def print_cross_link_binary_symbols(self, prot_list,

                                         prot_list2=None):

         tmp_matrix = []

         confidence_list = []

         for xl in self.crosslinks:

             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, descriptor) = xl


             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue


             if descriptor != "original":

                 continue


             confidence_list.append(confidence)


             dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]

             tmp_dist_binary = []

             for d in dists:

                 if d < 35:

                     tmp_dist_binary.append(1)

                 else:

                     tmp_dist_binary.append(0)

             tmp_matrix.append(tmp_dist_binary)


         matrix = list(zip(*tmp_matrix))


         satisfied_high_sum = 0

         satisfied_mid_sum = 0

         satisfied_low_sum = 0

         total_satisfied_sum = 0

         for k, m in enumerate(matrix):

             satisfied_high = 0

             total_high = 0

             satisfied_mid = 0

             total_mid = 0

             satisfied_low = 0

             total_low = 0

             total_satisfied = 0

             total = 0

             for n, b in enumerate(m):

                 if confidence_list[n] == "0.01":

                     total_high += 1

                     if b == 1:

                         satisfied_high += 1

                         satisfied_high_sum += 1

                 elif confidence_list[n] == "0.05":

                     total_mid += 1

                     if b == 1:

                         satisfied_mid += 1

                         satisfied_mid_sum += 1

                 elif confidence_list[n] == "0.1":

                     total_low += 1

                     if b == 1:

                         satisfied_low += 1

                         satisfied_low_sum += 1

                 if b == 1:

                     total_satisfied += 1

                     total_satisfied_sum += 1

                 total += 1

             print(k, satisfied_high, total_high)

             print(k, satisfied_mid, total_mid)

             print(k, satisfied_low, total_low)

             print(k, total_satisfied, total)

         print(float(satisfied_high_sum) / len(matrix))

         print(float(satisfied_mid_sum) / len(matrix))

         print(float(satisfied_low_sum) / len(matrix))

 # ------------


     def get_unique_crosslinks_statistics(self, prot_list,

                                          prot_list2=None):


         print(prot_list)

         print(prot_list2)

         satisfied_high = 0

         total_high = 0

         satisfied_mid = 0

         total_mid = 0

         satisfied_low = 0

         total_low = 0

         total = 0

         tmp_matrix = []

         satisfied_string = []

         for xl in self.crosslinks:

             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, descriptor) = xl


             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue


             if descriptor != "original":

                 continue


             total += 1

             if confidence == "0.01":

                 total_high += 1

                 if mdist <= 35:

                     satisfied_high += 1

             if confidence == "0.05":

                 total_mid += 1

                 if mdist <= 35:

                     satisfied_mid += 1

             if confidence == "0.1":

                 total_low += 1

                 if mdist <= 35:

                     satisfied_low += 1

             if mdist <= 35:

                 satisfied_string.append(1)

             else:

                 satisfied_string.append(0)


             dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]

             tmp_dist_binary = []

             for d in dists:

                 if d < 35:

                     tmp_dist_binary.append(1)

                 else:

                     tmp_dist_binary.append(0)

             tmp_matrix.append(tmp_dist_binary)


         print("unique satisfied_high/total_high", satisfied_high, "/",

               total_high)

         print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)

         print("unique satisfied_low/total_low", satisfied_low, "/", total_low)

         print("total", total)


         matrix = list(zip(*tmp_matrix))

         satisfied_models = 0

         satstr = ""

         for b in satisfied_string:

             if b == 0:

                 satstr += "-"

             if b == 1:

                 satstr += "*"


         for m in matrix:

             all_satisfied = True

             string = ""

             for n, b in enumerate(m):

                 if b == 0:

                     string += "0"

                 if b == 1:

                     string += "1"

                 if b == 1 and satisfied_string[n] == 1:

                     pass

                 elif b == 1 and satisfied_string[n] == 0:

                     pass

                 elif b == 0 and satisfied_string[n] == 0:

                     pass

                 elif b == 0 and satisfied_string[n] == 1:

                     all_satisfied = False

             if all_satisfied:

                 satisfied_models += 1

             print(string)

             print(satstr, all_satisfied)

         print("models that satisfies the median satisfied cross-links/total "

               "models", satisfied_models, len(matrix))


     def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,

                                                 prot_list2=None):


         import matplotlib as mpl

         mpl.use('Agg')

         import matplotlib.pylab as pl


         tmp_matrix = []

         for kw in self.cross_link_distances_unique:

             (r1, c1, r2, c2) = kw

             dists = self.cross_link_distances_unique[kw]


             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue

             # append the sum of dists to order by that in the matrix plot

             dists.append(sum(dists))

             tmp_matrix.append(dists)


         tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))


         # print len(tmp_matrix),  len(tmp_matrix[0])-1

         matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))


         for i in range(len(tmp_matrix)):

             for k in range(len(tmp_matrix[i]) - 1):

                 matrix[i][k] = tmp_matrix[i][k]


         print(matrix)


         fig = pl.figure()

         ax = fig.add_subplot(211)


         cax = ax.imshow(matrix, interpolation='nearest')

         fig.colorbar(cax)

         pl.savefig(figurename, dpi=300)

         pl.show()


     def plot_bars(

         self,

         filename,

         prots1,

         prots2,

         nxl_per_row=20,

         arrangement="inter",

             confidence_input="None"):


         data = []

         for xl in self.cross_link_distances:

             (r1, c1, r2, c2, mdist, confidence) = xl

             if c1 in prots1 and c2 in prots2:

                 if arrangement == "inter" and c1 == c2:

                     continue

                 if arrangement == "intra" and c1 != c2:

                     continue

                 if confidence_input == confidence:

                     label = str(c1) + ":" + str(r1) + \

                         "-" + str(c2) + ":" + str(r2)

                     values = self.cross_link_distances[xl]

                     frequency = self.cross_link_frequency[(r1, c1, r2, c2)]

                     data.append((label, values, mdist, frequency))


         sort_by_dist = sorted(data, key=lambda tup: tup[2])

         sort_by_dist = list(zip(*sort_by_dist))

         values = sort_by_dist[1]

         positions = list(range(len(values)))

         labels = sort_by_dist[0]

         frequencies = list(map(float, sort_by_dist[3]))

         frequencies = [f * 10.0 for f in frequencies]


         nchunks = int(float(len(values)) / nxl_per_row)

         values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)

         positions_chunks = IMP.pmi.tools.chunk_list_into_segments(

             positions,

             nchunks)

         frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(

             frequencies,

             nchunks)

         labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)


         for n, v in enumerate(values_chunks):

             p = positions_chunks[n]

             f = frequencies_chunks[n]

             IMP.pmi.output.plot_fields_box_plots(

                 filename + "." + str(n), v, p, f,

                 valuename="Distance (Ang)",

                 positionname="Unique " + arrangement + " Crosslinks",

                 xlabels=labels_chunks[n])


     def crosslink_distance_histogram(self, filename,

                                      prot_list=None,

                                      prot_list2=None,

                                      confidence_classes=None,

                                      bins=40,

                                      color='#66CCCC',

                                      yplotrange=[0, 1],

                                      format="png",

                                      normalized=False):

         if prot_list is None:

             prot_list = list(self.prot_length_dict.keys())


         distances = []

         for xl in self.crosslinks:

             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, descriptor) = xl


             if confidence_classes is not None:

                 if confidence not in confidence_classes:

                     continue


             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue


             distances.append(mdist)


         IMP.pmi.output.plot_field_histogram(

             filename, distances, valuename="C-alpha C-alpha distance [Ang]",

             bins=bins, color=color,

             format=format,

             reference_xline=35.0,

             yplotrange=yplotrange, normalized=normalized)


     def scatter_plot_xl_features(self, filename,

                                  feature1=None,

                                  feature2=None,

                                  prot_list=None,

                                  prot_list2=None,

                                  yplotrange=None,

                                  reference_ylines=None,

                                  distance_color=True,

                                  format="png"):

         import matplotlib as mpl

         mpl.use('Agg')

         import matplotlib.pyplot as plt


         fig = plt.figure(figsize=(10, 10))

         ax = fig.add_subplot(111)


         for xl in self.crosslinks:

             (r1, c1, r2, c2, mdist, stdv, confidence,

              unique_identifier, arrangement) = xl


             if prot_list2 is None:

                 if c1 not in prot_list:

                     continue

                 if c2 not in prot_list:

                     continue

             else:

                 if c1 in prot_list and c2 in prot_list2:

                     pass

                 elif c1 in prot_list2 and c2 in prot_list:

                     pass

                 else:

                     continue


             xldb = self.external_csv_data[unique_identifier]

             xldb.update({"Distance": mdist})

             xldb.update({"Distance_stdv": stdv})


             xvalue = float(xldb[feature1])

             yvalue = float(xldb[feature2])


             if distance_color:

                 color = self.colormap(mdist)

             else:

                 color = "gray"


             ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)


         if yplotrange is not None:

             ax.set_ylim(yplotrange)

         if reference_ylines is not None:

             for rl in reference_ylines:

                 ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)


         if filename:

             plt.savefig(filename + "." + format, dpi=150, transparent="False")


         plt.show()

IMP::pmi.analysis.RMSD.get_output
def get_output
Returns output for IMP.pmi.output.Output object.
Definition: pmi/Analysis.py:579

IMP::algebra::Transformation3D
Simple 3D transformation class.
Definition: Transformation3D.h:37

IMP::pmi.analysis.CrossLinkTable
Visualization of cross-links.
Definition: pmi/Analysis.py:1630

IMP::atom::Fragment
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20

IMP::atom::get_drms
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Definition: atom/distance.h:147

IMP::pmi.output.ProcessOutput
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: output.py:931

IMP::rmf::create_hierarchies
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)

IMP::pmi.output.plot_field_histogram
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1754

IMP::pmi.output.plot_fields_box_plots
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1819

IMP::atom::get_drmsd
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:49

IMP::pmi.tools.get_molecules
def get_molecules
This function returns the parent molecule hierarchies of given objects.
Definition: tools.py:1159

IMP::pmi.tools
Miscellaneous utilities.
Definition: tools.py:1

IMP::pmi.analysis.Precision.__init__
def __init__
Constructor.
Definition: pmi/Analysis.py:606

IMP::handle_use_deprecated
void handle_use_deprecated(std::string message)
Break in this method in gdb to find deprecated uses at runtime.

IMP::algebra::get_weighted_rmsd
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Definition: algebra/distance.h:83

IMP::pmi.analysis.RMSD
Compute the RMSD (without alignment) taking into account the copy ambiguity.
Definition: pmi/Analysis.py:461

IMP::em::MRCReaderWriter
Definition: MRCReaderWriter.h:20

IMP::rmf::link_restraints
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)

IMP::pmi.analysis.Precision
A class to evaluate the precision of an ensemble.
Definition: pmi/Analysis.py:595

log
Definition: log.py:1

IMP::pmi.analysis.Clustering
A class to cluster structures.
Definition: pmi/Analysis.py:183

IMP::pmi.analysis.GetModelDensity.__init__
def __init__
Constructor.
Definition: pmi/Analysis.py:1165

IMP::pmi.analysis.Precision.get_rmsd_wrt_reference_structure_with_alignment
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
Definition: pmi/Analysis.py:1062

IMP::pmi.analysis.Alignment.__init__
def __init__
Constructor.
Definition: pmi/Analysis.py:30

IMP::em::SampledDensityMap
Class for sampling a density map from particles.
Definition: SampledDensityMap.h:31

IMP::atom::Copy
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28

IMP::pmi.analysis.Precision.add_structure
def add_structure
Read a structure into the ensemble and store (as coordinates).
Definition: pmi/Analysis.py:688

IMP::em::multiply
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].

IMP::pmi.analysis.Alignment
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:21

IMP::pmi.analysis.GetModelDensity.add_subunits_density
def add_subunits_density
Add a frame to the densities.
Definition: pmi/Analysis.py:1187

IMP::em::create_density_map
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
Definition: DensityMap.h:678

IMP::pmi.tools.scatter_and_gather
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:542

IMP::algebra::get_rmsd
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Definition: algebra/distance.h:47

IMP::em
Basic utilities for handling cryo-electron microscopy 3D density maps.

IMP::rmf::load_frame
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.

IMP::core::XYZ
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30

IMP::pmi.analysis.get_particles_at_resolution_one
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
Definition: pmi/Analysis.py:1595

IMP::pmi.analysis.Clustering.fill
def fill
Add coordinates for a single model.
Definition: pmi/Analysis.py:209

IMP::pmi.analysis
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1

IMP::pmi.analysis.Clustering.do_cluster
def do_cluster
Run K-means clustering.
Definition: pmi/Analysis.py:252

IMP::algebra::Rotation3D
3D rotation class.
Definition: Rotation3D.h:52

IMP::em::get_bounding_box
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:509

IMP::algebra::get_identity_transformation_3d
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Definition: Transformation3D.h:134

IMP::pmi.output
Classes for writing output files and processing them.
Definition: output.py:1

IMP::pmi.analysis.Precision.set_reference_structure
def set_reference_structure
Read in a structure used for reference computation.
Definition: pmi/Analysis.py:1046

IMP::pmi.analysis.Precision.get_rmsf
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
Definition: pmi/Analysis.py:987

IMP::pmi.analysis.Precision.get_average_distance_wrt_reference_structure
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
Definition: pmi/Analysis.py:1107

IMP::pmi.analysis.GetModelDensity.get_density
def get_density
Get the current density for some component name.
Definition: pmi/Analysis.py:1251

IMP::algebra
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...

IMP::Exception
The general base class for IMP exceptions.
Definition: exception.h:48

IMP::pmi.analysis.Clustering.__init__
def __init__
Constructor.
Definition: pmi/Analysis.py:188

IMP::pmi.analysis.Precision.get_precision
def get_precision
Evaluate the precision of two named structure groups.
Definition: pmi/Analysis.py:862

IMP::algebra::Vector3D
VectorD< 3 > Vector3D
Definition: VectorD.h:408

IMP::rmf::create_restraints
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)

IMP::pmi.analysis.RMSD.__init__
def __init__
Definition: pmi/Analysis.py:474

IMP::pmi.analysis.RMSD.get_moldict_coorddict
def get_moldict_coorddict
return data structure for the RMSD calculation
Definition: pmi/Analysis.py:495

IMP::rmf::link_hierarchies
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)

IMP::atom::get_copy_index
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.

IMP::pmi.analysis.get_particles_at_resolution_ten
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Definition: pmi/Analysis.py:1613

IMP::pmi
Python classes to represent, score, sample and analyze models.

IMP::pmi.analysis.Precision.add_structures
def add_structures
Read a list of RMFs, supports parallel.
Definition: pmi/Analysis.py:740

IMP::algebra::get_transformation_aligning_first_to_second
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)

IMP::atom::get_leaves
Hierarchies get_leaves(const Selection &h)

IMP::atom::get_drmsd_Q
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Definition: atom/distance.h:85

IMP::atom::Selection
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70

IMP::pmi.analysis.GetModelDensity
Compute mean density maps from structures.
Definition: pmi/Analysis.py:1154

IMP::rmf
Support for the RMF file format for storing hierarchical molecular data and markup.

IMP::pmi.tools.get_residue_indexes
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:499

IMP::core::XYZR
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27