IMP logo
IMP Reference Guide  develop.031dafb4d2,2024/05/16
The Integrative Modeling Platform
create_DAG.py
1 """@namespace IMP.spatiotemporal.create_DAG
2  Simplified function for creating a spatiotemporal model.
3 """
4 import os
5 import itertools
6 import warnings
7 from IMP.spatiotemporal import graphNode
8 from IMP.spatiotemporal.score_graph import score_graph
9 from IMP.spatiotemporal import write_output
10 from IMP.spatiotemporal import composition_scoring
11 
12 
def create_DAG(state_dict,
               # optional inputs related to model input / calculation
               input_dir='', scorestr='_scores.log', output_dir='',
               # optional inputs related to spatiotemporal scoring
               # (only allowing associative transitions).
               spatio_temporal_rule=False, subcomplexstr='.config',
               expected_subcomplexes=[],
               # optional inputs related to composition scores
               score_comp=False, exp_comp_map={},
               # optional inputs related to model output
               out_cdf=True, out_labeled_pdf=True, out_pdf=False, npaths=0,
               # optional inputs related to DAG output
               draw_dag=True):
    """
    This function streamlines the process of creating a graph by performing
    all the necessary steps and saving relevant input to files. Features of
    this function are walked through in
    example/toy/Simple_spatiotemporal_example.py

    Note that this function changes the working directory of the process:
    it calls os.chdir(input_dir) before reading the scores and
    os.chdir(output_dir) before writing the output, and it does not restore
    the original working directory. Because the change to input_dir happens
    first, a relative output_dir is resolved relative to input_dir.

    @param state_dict: dictionary that defines the spatiotemporal model.
           The keys are strings that correspond to each time point in the
           stepwise temporal process. Keys should be ordered according to the
           steps in the spatiotemporal process. The values are integers that
           correspond to the number of possible states at that timepoint.
           Scores for each model are expected to be stored as
           $state_$timescorestr, where state are integers 1->value of the
           dictionary, time is the key in the dictionary, and scorestr is
           trailing characters, which are assumed to be constant for
           all states.
    @param input_dir: string, directory where the data is stored. Empty string
           assumes current working directory.
    @param scorestr: string, trailing characters at the end of the file with
           scores for each stage of the spatiotemporal model
           (default: '_scores.log').
    @param output_dir: string, directory where the output will be written.
           Empty string assumes the same directory as the input_dir.
           The directory (including missing parent directories) is created
           if it does not exist.
    @param spatio_temporal_rule: Boolean. If true, enforces that all components
           earlier in the assembly process are present later in the process.
           (default: False)
    @param subcomplexstr: string, trailing characters after the subcomplex
           file, which is a list of subcomplexes included in the given
           label/time (default: '.config')
    @param expected_subcomplexes: list of all possible subcomplex strings
           in the model (default: []) Should be a list without duplicates of
           all components in the subcomplex files.
    @param score_comp: Boolean to determine whether or not to score models
           based on the protein composition.
    @param exp_comp_map: dictionary for determining protein composition score.
           The keys are the proteins. The code checks if the name of these
           proteins are within the subcomplex_components for each node.
           As such, the naming scheme should be such that the keys of
           exp_comp_map are substrings of expected_subcomplexes. The values
           of exp_comp_map should correspond to a csv file for each
           subcomplex with protein copy numbers. Each csv file should have
           3 columns:
           1) 'Time' - should correspond to the keys of state_dict,
           2) 'mean' - mean copy number from experimental data, and
           3) std - standard deviation from experimental data
    @param out_cdf: Boolean to determine whether or not to write out the
           cumulative distribution function (cdf) for the graph
           (default: True). filename: "cdf.txt"
    @param out_labeled_pdf: Boolean to determine whether to output the
           labeled pdf file, which includes both the pdf and the ordered
           states visited along each path (default: True).
           filename: "labeled_pdf.txt"
    @param out_pdf: Boolean to determine whether or not to write out the
           probability distribution function (pdf) for the graph
           (default: False) filename: "pdf.txt"
    @param npaths: int, write out the states along the n most likely paths,
           based on the pdf (default: 0) filename: "pathXX.txt", where XX
           is the number of the path
    @param draw_dag: Boolean to determine whether or not to write out a
           directed acyclic graph (dag) to a file (default: True)
           filename: "dag_heatmap"
    @return nodes: list of graphNode objects, corresponding to the snapshot
            models in the spatiotemporal model
    @return graph: list of all paths through the graph. Each path is a list
            of graphNode objects that correspond to the states visited
            along the path.
    @return graph_prob: list of probabilities for each path, ordered in the
            same order as the paths in graph
    @return graph_scores: list of tuples, where the first object is the
            path (list of graphNode objects for each state along the
            trajectory), and the second object is the score of the path,
            which can be used to calculate the probability.
    @exception TypeError if any argument is not of the documented type.
    @exception Exception if input_dir is non-empty and does not exist.
    """

    # Check that all inputs are the correct variable type. The checks are
    # table-driven so that each message is built from the parameter's own
    # name and cannot drift out of sync with the parameter being tested.
    expected_types = (
        ("state_dict", state_dict, dict),
        ("input_dir", input_dir, str),
        ("scorestr", scorestr, str),
        ("output_dir", output_dir, str),
        ("spatio_temporal_rule", spatio_temporal_rule, bool),
        ("subcomplexstr", subcomplexstr, str),
        ("expected_subcomplexes", expected_subcomplexes, list),
        ("score_comp", score_comp, bool),
        ("exp_comp_map", exp_comp_map, dict),
        ("out_cdf", out_cdf, bool),
        ("out_labeled_pdf", out_labeled_pdf, bool),
        ("out_pdf", out_pdf, bool),
        ("npaths", npaths, int),
        ("draw_dag", draw_dag, bool),
    )
    for arg_name, arg_value, arg_type in expected_types:
        if not isinstance(arg_value, arg_type):
            raise TypeError(
                arg_name + " should be of type " + arg_type.__name__)

    # Set manual parameters
    # cdf_fn - string, name of the file for the cdf
    cdf_fn = 'cdf.txt'
    # labeled_pdf_fn - string, name of the file for the labeled pdf
    labeled_pdf_fn = 'labeled_pdf.txt'
    # pdf_fn - string, name of the file for the pdf
    pdf_fn = 'pdf.txt'
    # npath_fn - string, name of the file for each pathway
    npath_fn = 'path'

    # dag_fn - string, filename for the dag image (default: 'dag_heatmap')
    dag_fn = 'dag_heatmap'
    # dag_heatmap - Boolean to determine whether or not to write the dag
    # with a heatmap based on the probability of each state (default: True)
    dag_heatmap = True
    # dag_colormap - string, colormap used by the dag to represent probability.
    # Chooses from those available in matplotlib
    # (https://matplotlib.org/stable/users/explain/colors/colormaps.html)
    # (default: "Purples").
    dag_colormap = "Purples"
    # dag_draw_label - Boolean to determine whether or not to draw state
    # labels on the dag
    dag_draw_label = True
    # dag_fontname - string, font used for the labels on the dag
    dag_fontname = "Helvetica"
    # dag_fontsize - string, font size used for the labels on the dag
    dag_fontsize = "18"
    # dag_penscale - float, size of the pen used to draw arrows on the dag
    dag_penscale = 0.6
    # dag_arrowsize - float, size of arrows connecting states on the dag
    dag_arrowsize = 1.2
    # dag_height - string, height of each node on the dag
    dag_height = "0.6"
    # dag_width - string, width of each node on the dag
    dag_width = "0.6"

    # check proteins in the exp_comp_map exist in expected_subcomplexes
    for protein in exp_comp_map:
        if not any(protein in subcomplex
                   for subcomplex in expected_subcomplexes):
            warnings.warn(
                'WARNING!!! Check exp_comp_map and expected_subcomplexes. '
                'protein ' + protein + ' is not found in '
                'expected_subcomplexes. '
                'This could cause illogical results.')

    # Step 1: Initialize graph with static scores
    print('Initialing graph...')
    # list of all nodes
    nodes = []
    # keys correspond to all timepoints
    keys = list(state_dict.keys())
    # Go to input_dir, if it exists
    if len(input_dir) > 0:
        if os.path.exists(input_dir):
            os.chdir(input_dir)
        else:
            raise Exception(
                "Error!!! Does not exist: " + input_dir + '\nClosing...')

    # Loop over all keys and all states. State labels are 1-based.
    for key in keys:
        for state in range(1, state_dict[key] + 1):
            node = graphNode.graphNode()
            node.init_graphNode(key, str(state), scorestr, subcomplexstr,
                                expected_subcomplexes)
            nodes.append(node)

    # build up candidate edges in graph, between consecutive time points only
    for time_a, time_b in zip(keys, keys[1:]):
        # get time marginalized nodes
        nodes_a = [n for n in nodes if n.get_time() == time_a]
        nodes_b = [n for n in nodes if n.get_time() == time_b]
        # draw edges between pairs. Include whether or not to include
        # spatio_temporal_rule
        for node_a, node_b in itertools.product(nodes_a, nodes_b):
            graphNode.draw_edge(node_a, node_b, spatio_temporal_rule)
    # set indices for all nodes. These are unique for each node,
    # unlike labels, which can overlap
    for node_index, node in enumerate(nodes):
        node.set_index(node_index)
    print('Done.')

    # Step 2: Add composition static score to graph
    if score_comp:
        print('Calculation composition likelihood...')
        nodes = composition_scoring.calc_likelihood(exp_comp_map, nodes)
        print('Done.')

    # Step 3: Compute all paths, as well as their scores
    print('Scoring directed acycling graph...')
    graph, graph_prob, graph_scores = score_graph(nodes, keys)
    print('Done.')

    # Step 4: Draw DAG and save relevant output
    print('Writing output...')
    # Go to output directory, creating it (and any missing parents) first
    # if needed. An empty output_dir keeps the current directory.
    if len(output_dir) > 0:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        os.chdir(output_dir)
    write_output.write_cdf(out_cdf, cdf_fn, graph_prob)
    write_output.write_pdf(out_pdf, pdf_fn, graph_prob)
    write_output.write_labeled_pdf(out_labeled_pdf, labeled_pdf_fn, graph,
                                   graph_prob)
    write_output.write_final_npaths(npaths, npath_fn, graph_scores,
                                    graph_prob)
    if draw_dag:
        write_output.draw_dag(
            dag_fn, nodes, graph, graph_prob, keys, heatmap=dag_heatmap,
            colormap=dag_colormap, draw_label=dag_draw_label,
            fontname=dag_fontname, fontsize=dag_fontsize,
            penscale=dag_penscale, arrowsize=dag_arrowsize, height=dag_height,
            width=dag_width)
    print('Done.')

    return nodes, graph, graph_prob, graph_scores
Functions to traverse and score the spatiotemporal graphs.
Definition: score_graph.py:1
Spatiotemporal scoring in IMP.
def create_DAG
This function streamlines the process of creating a graph by performing all the necessary steps and ...
Definition: create_DAG.py:76
The general base class for IMP exceptions.
Definition: exception.h:48
A class to represent a node in a spatiotemporal process.
Definition: graphNode.py:12