IMP logo
IMP Reference Guide  develop.d97d4ead1f,2024/11/21
The Integrative Modeling Platform
prepare_protein_library.py
1 """@namespace IMP.spatiotemporal.prepare_protein_library
2  Function for preparing spatiotemporal models for sampling.
3 """
4 import numpy as np
5 import itertools
6 from IMP.spatiotemporal import composition_scoring
7 import os
8 
9 
10 def prepare_protein_library(times, exp_comp_map, expected_subcomplexes,
11  nmodels, output_dir='', template_topology='',
12  template_dict={}, match_final_state=True):
13  """
14  Function that reads in experimental stoicheometery data and calculates
15  which compositions and location assignments should be sampled for
16  spatiotemporal modeling, which are saved as config files. Optionally,
17  a PMI topology file can be provided, in which case topology files
18  for each composition and location assignment are also written.
19  The output is 3 types of files:
20  1. *_time.config - configuration files, which list the proteins
21  included at each time point for each model
22  2. time.txt - protein copy number files. Each row is a protein
23  copy number state and each column is the
24  protein copy number in that state. Note that each
25  protein copy number state can result in multiple
26  location assignments.
27  3. *_time_topol.txt - topology files for each copy number
28  and location assignment.
29 
30  @param times: list of strings, the times at which the stoicheometery
31  data should be read.
32  @param exp_comp_map: dictionary, which describes protein
33  stoicheometery.
34  The key describes the protein, which should correspond to names
35  within the expected_subcomplexes. Only copy numbers for proteins
36  or subcomplexes included in this dictionary will be scored. For
37  each of these proteins, a csv file should be provided with protein
38  copy number data. The csv file should have 3 columns,
39  1) "Time", which matches up to the possible times in the graph,
40  2) "mean", the average protein copy number at that time point
41  from experiment, and 3) "std", the standard deviation of that
42  protein copy number from experiment.
43  @param expected_subcomplexes: list of all possible subcomplex strings
44  in the model. Should be a list without duplicates of
45  all components in the subcomplex configuration files.
46  @param nmodels: int, number of models with different protein copy
47  numbers to generate at each time point.
48  @param output_dir: string, directory where the output will be written.
49  Empty string assumes the current working directory.
50  @param template_topology: string, name of the topology file
51  for the complete complex.
52  (default: '', no topology files are output)
53  @param template_dict: dictionary for connecting the spatiotemporal
54  model to the topology file. The keys (string) are the names of
55  the proteins, defined by the expected_complexes variable.
56  The values (list) are the names of all proteins in the topology
57  file that should have the same copy number
58  as the labeled protein, specifically the "molecule_name."
59  (default: {}, no topology files are output)
60  @param match_final_state: Boolean, determines whether to fix the
61  final state to the state defined by expected_subcomplexes.
62  True enforces this match and thus ensures that the final
63  time has only one state.
64  (default: True)
65  """
66  # Assert that all inputs are the correct variable type
67  if not isinstance(times, list):
68  raise TypeError("times should be of type list")
69  if not isinstance(exp_comp_map, dict):
70  raise TypeError("times should be of type dict")
71  if not isinstance(expected_subcomplexes, list):
72  raise TypeError("nmodels should be of type list")
73  if not isinstance(nmodels, int):
74  raise TypeError("nmodels should be of type int")
75  if not isinstance(output_dir, str):
76  raise TypeError("output_dir should be of type str")
77  if not isinstance(template_topology, str):
78  raise TypeError("template_topology should be of type str")
79  if not isinstance(template_dict, dict):
80  raise TypeError("template_dict should be of type dict")
81  if not isinstance(match_final_state, bool):
82  raise TypeError("match_final_state should be of type bool")
83  # make output_dir if necessary
84  if len(output_dir) > 0:
85  if os.path.exists(output_dir):
86  pass
87  else:
88  os.mkdir(output_dir)
89  os.chdir(output_dir)
90  # Whether or not topology files should be written
91  include_topology = False
92  # calculate final copy numbers based on the expected complexes
93  final_CN = np.zeros(len(exp_comp_map.keys()), dtype=int)
94  for i, key in enumerate(exp_comp_map.keys()):
95  for subcomplex in expected_subcomplexes:
96  if key in subcomplex:
97  final_CN[i] += 1
98  # Enumerate all possible compositions (independent of time)
99  # start by computing the range for each protein copy number
100  copy_num = []
101  for CN in final_CN:
102  copy_num.append(range(CN+1))
103  # Convert the range of protein copy numbers to all possible
104  # combinations of states
105  all_library = [iterable for iterable in itertools.product(*copy_num)]
106  # remove empty state
107  empty_state = []
108  for i in range(len(final_CN)):
109  empty_state.append(0)
110  all_library.pop(all_library.index(tuple(empty_state)))
111 
112  # compare the complete composition library against the composition model
113  # at each time and extract top scoring compositions
114  for time in times:
115  # if match_final_state is True, restrict the final state to match
116  # the known copy numbers
117  if time == times[len(times)-1] and match_final_state:
118  olist = [list(final_CN)]
119  state_list = []
120  # Keep all expected complexes
121  state_list.append(expected_subcomplexes)
122  # otherwise, calculate the nmodels most likely copy numbers
123  # for each time point
124  else:
125  # calculate the weights for each state
126  unnormalized_weights = []
127  for state in all_library:
128  unnormalized_weights.append(
129  composition_scoring.calc_likelihood_state(
130  exp_comp_map, time, state))
131  unw = np.array(unnormalized_weights)
132  # get top scoring nmodels
133  mindx = np.argsort(unw)[0:nmodels]
134  # write out library with the top scoring models
135  olist = []
136  state_list = []
137  for m in mindx:
138  state = all_library[m]
139  # convert state counts to protein list
140  olist.append(list(state))
141  # Loops over each copy number state
142  for state in olist:
143  cn_list = []
144  # Loops over each protein
145  for i, cn in enumerate(state):
146  sub_list = []
147  # initiate found_subcomplex
148  found_subcomplex = []
149  for subcomplex in expected_subcomplexes:
150  # See which subcomplexes this protein appears in
151  if list(exp_comp_map.keys())[i] in subcomplex:
152  found_subcomplex.append(subcomplex)
153  # find proteins using combinatorics
154  prots = list(itertools.combinations(found_subcomplex,
155  state[i]))
156  for prot in prots:
157  sub_list.append(prot)
158  if len(sub_list[0]) > 0:
159  cn_list.append(sub_list)
160  # combine the list of copy numbers
161  all_cn = [iterable for iterable in itertools.product(*cn_list)]
162  # format the copy number list from a list (all copy number
163  # combinations) of tuples (copy number of 1
164  # state) of tuples (name of 1 protein) to a list
165  # (all copies) of lists (all proteins)
166  # loop over all cn
167  for cn in all_cn:
168  cn_list2 = []
169  # select each copy number
170  for n in cn:
171  # append each protein in each copy number to list
172  for prot in n:
173  cn_list2.append(prot)
174  state_list.append(cn_list2)
175  # write top "scoring" compositions to file
176  oary = np.array(olist, dtype=int)
177  header = ''
178  for prot_name in exp_comp_map.keys():
179  header = header+str(prot_name)+'\t\t\t\t'
180  np.savetxt(time + ".txt", oary, header=header)
181 
182  # write protein config library to file
183  for indx, prot_list in enumerate(state_list):
184  with open(str(indx + 1) + "_" + time + ".config", "w") as fh:
185  for prot in prot_list:
186  fh.write(prot + "\n")
187 
188  if len(template_topology) > 0:
189  include_topology = True
190  # write topology file for each state
191  for indx, prot_list in enumerate(state_list):
192  # Names of proteins to keep, according to template file
193  keep_prots = []
194  # loop over each entry of prot_list and
195  # convert to the template file name
196  for prot in prot_list:
197  if prot in template_dict.keys():
198  keep_prots.extend(template_dict[prot])
199  else:
200  raise KeyError("Protein " + prot
201  + ' does not exist in template_dict'
202  '\nClosing...')
203  # open new topology file
204  with open(str(indx + 1) + "_" + time +
205  "_topol.txt", "w") as fh:
206  old = open(template_topology, 'r')
207  line = old.readline()
208  while line:
209  line_split = line.split('|')
210  # keep blank lines
211  if len(line_split) < 2:
212  fh.write(line)
213  else:
214  # Keep lines where the protein name
215  # is in the template_dict
216  if line_split[1] in keep_prots:
217  fh.write(line)
218  # Keep instruction line
219  elif line_split[1] == 'molecule_name ':
220  fh.write(line)
221  line = old.readline()
222  old.close()
223  fh.close()
224  if include_topology:
225  print('Successfully calculated the most likely configurations,'
226  ' and saved them to configuration and topology '
227  'files.')
228  else:
229  print('Successfully calculated the most likely configurations,'
230  ' and saved them to configuration files.')
Spatialtemporal scoring in IMP.
def prepare_protein_library
Function that reads in experimental stoicheometery data and calculates which compositions and locatio...