IMP logo
IMP Reference Guide  develop.d97d4ead1f,2024/11/21
The Integrative Modeling Platform
composition_scoring.py
1 """@namespace IMP.spatiotemporal.composition_scoring
2  Functions for weighting graphNode objects based on stoichiometry data.
3 """
4 import numpy as np
5 import warnings
6 import os
7 
8 
def get_state(subcomplex_components, prot):
    """
    Count how many entries of subcomplex_components contain prot.

    @param subcomplex_components: subcomplexes or components in a given node,
           which can be accessed by graphNode.get_subcomplex_components()
    @param prot: string, protein or subcomplex we are interested in finding
    @return state, int, number of entries of subcomplex_components for
            which "prot in entry" holds
    """
    # Membership is tested with the "in" operator on each entry, so for
    # string entries this is a substring match, not an exact comparison.
    return sum(1 for component in subcomplex_components if prot in component)
26 
27 
def composition_likelihood_function(mean, std, prots, node):
    """Function that calculates the likelihood of an individual node, used by
    calc_likelihood().

    For every protein in prots, the copy number x found in the node is
    scored as
        0.5 * ((x - mean) / std)**2 + log(std * sqrt(2 * pi))
    and the per-protein terms are summed.

    @param mean: dictionary of dictionaries where the first key is the protein,
           the second key is the time, and the expected mean copy number
           from experiment is returned.
    @param std: dictionary of dictionaries where the first key is the protein,
           the second key is the time, and the expected standard deviation
           of protein copy number from experiment is returned.
    @param prots: list of proteins or subcomplexes which will be scored
           according to this likelihood function
    @param node: the graphNode object for which the likelihood will be
           calculated. Only node.get_time() and
           node.get_subcomplex_components() are used here.
    @return w: float, the weight of the graphNode according to the composition
            likelihood function.
    """
    # get time
    t = node.get_time()
    # The component list does not depend on the protein being scored, so it
    # is fetched once instead of once per protein.
    components = node.get_subcomplex_components()
    w = 0
    for prot in prots:
        # x counts the number of proteins of a given type in the node
        x = get_state(components, prot)
        sigma = std[prot][t]
        mu = mean[prot][t]
        # A non-positive (or NaN) standard deviation is only reported, not
        # rejected: the score is still computed so that callers keep the
        # original best-effort behavior.
        if not sigma > 0:
            # str() keeps the warning from raising TypeError when the time
            # (or protein key) is not a string, e.g. a numeric time.
            warnings.warn(
                'WARNING!!! Standard deviation of protein ' + str(prot)
                + ' 0 or less at time ' + str(t)
                + '. May lead to illogical results.')
        w += (0.5 * ((x - mu) / sigma)**2
              + np.log(sigma * np.sqrt(2 * np.pi)))
    return w
62 
63 
def calc_likelihood(exp_comp_map, nodes):
    """
    Function that adds a score for the compositional likelihood for all
    states represented as nodes in the graph. The composition likelihood
    assumes a Gaussian distribution for copy number of each protein or
    subcomplex with means and standard deviations derived from experiment.
    Returns the nodes, with the new weights added.

    @param exp_comp_map: dictionary, which describes protein stoichiometry.
           The key describes the protein, which should correspond to names
           within the expected_subcomplexes. Only copy numbers for proteins
           or subcomplexes included in this dictionary will be scored. For
           each of these proteins, a csv file should be provided with protein
           copy number data. The csv file should have 3 columns,
           1) "Time", which matches up to the possible times in the graph,
           2) "mean", the average protein copy number at that time point
           from experiment, and 3) "std", the standard deviation of that
           protein copy number from experiment.
    @param nodes: list of graphNode objects, which have already been
           initiated with static scores
    @return nodes: edited list of graphNode objects, which now have static
            and composition scores
    @exception FileNotFoundError if a file named in exp_comp_map does not
               exist.
    """
    # pandas is imported lazily, as in the rest of this function's history,
    # so importing the module does not require pandas.
    import pandas as pd
    # Get list of all proteins
    prots = list(exp_comp_map.keys())
    # Data is stored as a dictionary of dictionaries. The first dictionary
    # references which protein you are referring to.
    # the 2nd dictionary references which time you are referring to. The return
    # is the mean or standard deviation of the protein copy number
    mean = {}
    std = {}
    # import csv file as pandas data frame
    for prot in prots:
        csv_path = exp_comp_map[prot]
        if not os.path.exists(csv_path):
            raise FileNotFoundError(
                "Error!!! Check exp_comp_map. Unable to find composition "
                "file: " + str(csv_path) + '\nClosing...')
        exp = pd.read_csv(csv_path)
        # Pair the columns positionally instead of looking rows up by index
        # label one at a time. to_numpy() keeps the values as NumPy scalars,
        # so a zero standard deviation still yields inf/nan downstream
        # rather than a ZeroDivisionError.
        times = exp['Time'].to_numpy()
        mean[prot] = dict(zip(times, exp['mean'].to_numpy()))
        std[prot] = dict(zip(times, exp['std'].to_numpy()))
    # loop over all nodes and calculate the likelihood for each node
    for node in nodes:
        # compute the compositional likelihood of the nodes
        weight = composition_likelihood_function(mean, std, prots, node)
        # add state weight to node
        node.add_score(float(weight))
    return nodes
118 
119 
def calc_likelihood_state(exp_comp_map, t, state):
    """
    Function that calculates the compositional likelihood of a single
    state at a single time, similar to how composition_likelihood_function
    calculates the composition likelihood of a node. Used by
    prepare_protein_library.
    The composition likelihood assumes a Gaussian distribution for copy
    number of each protein or subcomplex with means and standard
    deviations derived from experiment. Returns the weight of the state.

    @param exp_comp_map: dictionary, which describes protein stoichiometry.
           The key describes the protein, which should correspond to names
           within the expected_subcomplexes. Only copy numbers for proteins
           or subcomplexes included in this dictionary will be scored. For
           each of these proteins, a csv file should be provided with protein
           copy number data. The csv file should have 3 columns,
           1) "Time", which matches up to the possible times in the graph,
           2) "mean", the average protein copy number at that time point
           from experiment, and 3) "std", the standard deviation of that
           protein copy number from experiment.
    @param t: string, time at which the composition likelihood should be
           calculated. Should match a possible value in the "Time" column
           of the files in exp_comp_map.
    @param state: list of integers, an array of the number of protein copy
           numbers for which the likelihood will be calculated.
           This array should list the proteins in the same order as
           the exp_comp_map.
    @return weight: float, the weight of the state according to the
            composition likelihood function.
    @exception FileNotFoundError if a file named in exp_comp_map does not
               exist.
    """
    import pandas as pd
    # Data is stored as a dictionary of dictionaries. The first dictionary
    # references which protein you are referring to.
    # the 2nd dictionary references which time you are referring to. The return
    # is the mean or standard deviation of the protein copy number
    mean = {}
    std = {}
    state_cn = {}
    # import csv file as pandas data frame
    for idx, (prot, csv_path) in enumerate(exp_comp_map.items()):
        # state is positional: entry idx belongs to the idx-th key of
        # exp_comp_map.
        state_cn[prot] = state[idx]
        if not os.path.exists(csv_path):
            raise FileNotFoundError(
                "Error!!! Check exp_comp_map. Unable to find composition "
                "file: " + str(csv_path) + '\nClosing...')
        exp = pd.read_csv(csv_path)
        # Pair the columns positionally; to_numpy() keeps the values as
        # NumPy scalars so a zero standard deviation gives inf/nan below
        # instead of raising ZeroDivisionError.
        times = exp['Time'].to_numpy()
        mean[prot] = dict(zip(times, exp['mean'].to_numpy()))
        std[prot] = dict(zip(times, exp['std'].to_numpy()))
    # compute the compositional likelihood of the state
    weight = 0
    for prot in exp_comp_map:
        # x is the copy number of this protein in the state
        x = state_cn[prot]
        sigma = std[prot][t]
        mu = mean[prot][t]
        # A non-positive (or NaN) standard deviation is only reported; the
        # score is still computed, as before.
        if not sigma > 0:
            # str() keeps the warning from raising TypeError when t (or
            # the protein key) is not a string, e.g. a numeric time.
            warnings.warn(
                'WARNING!!! Standard deviation of protein ' + str(prot)
                + ' 0 or less at time ' + str(t)
                + '. May lead to illogical results.')
        weight += (0.5 * ((x - mu) / sigma) ** 2 +
                   np.log(sigma * np.sqrt(2 * np.pi)))
    return weight
def composition_likelihood_function
Function that calculates the likelihood of an individual node, used by calc_likelihood().
def get_state
function to calculate how many times a protein appears in a list of proteins, which can be accessed f...
def calc_likelihood
Function that adds a score for the compositional likelihood for all states represented as nodes in th...
def calc_likelihood_state
Function that adds a score for the compositional likelihood for all states, similar to how compositio...