1 """@namespace IMP.EMageFit.solutions_io
2 Utility functions to store and retrieve solution information.
13 log = logging.getLogger(
"solutions_io")
18 ClusterRecord = collections.namedtuple(
20 [
'cluster_id',
'n_elements',
'representative',
'elements',
30 The heapq algorithm is a min-heap. I want a max-heap, that pops the
31 larger values out of the heap.
32 For that I have to modify the comparison function and also set the
33 index that is used for the comparison. The index corresponds to
34 the restraint that we desired to order by
39 Build from a tuple and the index used to compare
42 return tuple.__new__(self, x)
46 Compare. To convert the min-heap into a max-heap, the lower than
47 comparison is transformed into a greater-than
50 if self[i] > other[i]:
55 def __le__(self, other):
57 return self[i] >= other[i]
61 raisef=0.1, orderby=
"em2d"):
63 Reads a set of database files and merge them into a single file.
65 @param fns List of files with databases
66 @param fn_output The database to create
67 @param max_number Maximum number of records to keep, sorted according
69 @param raisef Ratio of problematic database files tolerated before
70 raising an error. This option is to tolerate some files
71 of the databases being broken because the cluster fails,
73 @param orderby Criterion used to sort the records
75 Makes sure to reorder all column names if necessary before merging
76 The record for the native solution is only added once (from first file).
82 names = db.get_table_column_names(tbl)
83 types = db.get_table_types(tbl)
85 sorted_names = [names[i]
for i
in indices]
86 sorted_types = [types[i]
for i
in indices]
89 ind = names.index(orderby)
90 they_are_sorted = field_delim.join(names)
92 sql_command =
"""SELECT %s FROM %s WHERE assignment="native"
93 LIMIT 1 """ % (they_are_sorted, tbl)
94 native_data = db.retrieve_data(sql_command)
96 log.info(
"Gathering results. Saving to %s", fn_output)
98 out_db.create(fn_output, overwrite=
True)
99 out_db.connect(fn_output)
100 out_db.create_table(tbl, sorted_names, sorted_types)
106 log.info(
"Reading %s", fn)
109 sql_command =
"""SELECT %s FROM %s
110 WHERE assignment<>"native"
111 ORDER BY %s ASC LIMIT %s """ % (
112 they_are_sorted, tbl, orderby, max_number)
113 data = db.retrieve_data(sql_command)
114 log.info(
"%s records read from %s", len(data), fn)
119 if len(best_records) < max_number:
120 heapq.heappush(best_records, a)
124 if best_records[0] < a:
125 heapq.heapreplace(best_records, a)
126 except Exception
as e:
127 log.error(
"Error for %s: %s", fn, e)
133 ratio = float(n_problems) / float(len(fns))
135 raise IOError(
"There are %8.1f %s of the database "
136 "files to merge with problems! " % (ratio * 100,
"%"))
138 heapq.heappush(best_records, native_data[0])
139 out_db.store_data(tbl, best_records)
145 Reads a set of database files and puts them in a single file
146 Makes sure to reorder all column names if necessary before merging
147 @param fns List of database files
148 @param fn_output Name of the output database
149 @param raisef See help for gather_best_solution_results()
155 names = db.get_table_column_names(tbl)
156 types = db.get_table_types(tbl)
158 sorted_names = [names[i]
for i
in indices]
159 sorted_types = [types[i]
for i
in indices]
160 log.info(
"Gathering results. Saving to %s", fn_output)
162 out_db.create(fn_output, overwrite=
True)
163 out_db.connect(fn_output)
164 out_db.create_table(tbl, sorted_names, sorted_types)
169 log.info(
"Reading %s", fn)
171 names = sorted(db.get_table_column_names(tbl))
172 they_are_sorted = field_delim.join(names)
173 log.debug(
"Retrieving %s", they_are_sorted)
174 sql_command =
"SELECT %s FROM %s" % (they_are_sorted, tbl)
175 data = db.retrieve_data(sql_command)
176 out_db.store_data(tbl, data)
178 except Exception
as e:
179 log.error(
"Error for file %s: %s", fn, e)
181 ratio = float(n_problems) / float(len(fns))
183 raise IOError(
"There are %8.1f %s of the database "
184 "files to merge with problems! " % (ratio * 100,
"%"))
189 """ Return indices that sort the list ls"""
190 pairs = sorted([(element, i)
for i, element
in enumerate(ls)])
191 indices = [p[1]
for p
in pairs]
198 Recover the reference frame of the n-th best solution from a database.
199 The index Nth stars at 0
202 sql_command =
""" SELECT %s FROM %s
204 ASC LIMIT 1 OFFSET %d """ % (f, tbl, orderby, Nth)
205 data = database.read_data(fn_database, sql_command)
207 raise ValueError(
"The requested %s-th best solution does not exist. "
208 "Only %s solutions found" % (Nth, len(data)))
213 def get_pca(string, delimiter="/"):
214 pca = string.split(delimiter)
215 pca = [float(p)
for p
in pca]
221 Get a list of fields and return a string with them. If there are no
222 fields, return an *, indicating SQL that all the fields are requested
223 @param fields A list of strings
228 return field_delim.join(fields)
235 Class for managing the results of the experiments
238 def __init__(self, ):
240 self.native_table_name =
"native"
241 self.results_table =
"results"
242 self.placements_table =
"placements"
243 self.ccc_table_name =
"ccc"
244 self.cluster_records = []
247 self.results_description_columns = [
"solution_id",
"assignment",
249 self.results_description_types = [int, str, str]
251 self.results_measures_columns = [
"drms",
"cdrms",
"crmsd"]
252 self.results_measures_types = [float, float, float]
256 Build the table of results
257 @param restraints_names The names given to the columns of the table
258 @param add_measures If True, add fields for comparing models
259 and native conformation
261 table_fields = self.results_description_columns + \
262 [
"total_score"] + restraints_names
263 table_types = self.results_description_types + \
264 [float] + [float
for r
in restraints_names]
267 table_fields += self.results_measures_columns
268 table_types += self.results_measures_types
269 log.debug(
"Creating table %s\n%s", table_fields, table_types)
270 self.
create_table(self.results_table, table_fields, table_types)
274 self.native_table_name,
279 max_number=
None, orderby=
False):
282 @param fields Fields to recover from the table
283 @param max_number Maximum number of solutions to recover
284 @param orderby Name of the restraint used for sorting the states
287 log.info(
"Getting %s from solutions", fields)
288 f = self.get_fields_string(fields)
289 sql_command =
"SELECT %s FROM %s " % (f, self.results_table)
291 sql_command +=
" ORDER BY %s ASC" % orderby
292 if max_number
not in (
None,
False):
293 sql_command +=
" LIMIT %d" % (max_number)
294 log.debug(
"Using %s", sql_command)
300 Get solutions from the database.
301 @param fields Fields requested. If the fields are in different
302 tables, a left join is done. Otherwise
303 get_solutions_results_table() is called. See
304 get_solutions_results_table() for the meaning
309 tables = self.get_tables_names()
310 log.debug(
"tables %s", tables)
311 required_tables = set()
312 pairs_table_field = []
316 for f, t
in [(f, t)
for f
in fields
for t
in tables]:
317 if t ==
"native" or f ==
"solution_id":
321 required_tables.add(t)
322 pairs_table_field.append((t, f))
323 required_tables = list(required_tables)
324 log.debug(
"required_tables %s", required_tables)
325 log.debug(
"pairs_table_field %s", pairs_table_field)
326 if len(required_tables) == 0:
330 elif len(required_tables) == 1
and required_tables[0] ==
"results":
334 elif len(required_tables) > 1:
338 sql_command +=
" ORDER BY %s ASC" % orderby
339 log.debug(
"Using %s", sql_command)
343 raise ValueError(
"Fields not found in the database")
347 Recover data for the native solution
348 @param fields Fields to recover
351 f = self.get_fields_string(fields)
352 sql_command =
"SELECT %s FROM %s " % (f, self.native_table_name)
356 def add_record(self, solution_id, assignment, RFs, total_score,
357 restraints_scores, measures):
359 Add a record to the database
360 @param solution_id The key for the solution
361 @param assignment The assignment for the solution provided by
363 @param RFs Reference frames of the rigid bodies of the components
364 of the assembly in the solution
365 @param total_score Total value of the scoring function
366 @param restraints_scores A list with all the values for the
368 @param measures A list with the values of all the measures for
371 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
372 RFs_txt = unit_delim.join(words)
373 record = [solution_id, assignment, RFs_txt, total_score] + \
375 if measures
is not None:
376 record = record + measures
377 self.records.append(record)
382 Add a record for the native structure to the database
383 see add_record() for the meaning of the parameters
385 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
386 RFs_txt = unit_delim.join(words)
388 record = [solution_id, assignment, RFs_txt, total_score] + \
391 record = record + measures
392 self.
store_data(self.native_table_name, [record])
394 def save_records(self, table="results"):
398 """ both distances and angles are expected to be a list of floats """
399 return [solution_id] + distances + angles
403 Creates a table to store the values of the placement scores for the
405 @param names Names of the components of the assembly
408 self.placement_table_name = self.placements_table
409 table_fields = [
"solution_id"]
410 table_fields += [
"distance_%s" % name
for name
in names]
411 table_fields += [
"angle_%s" % name
for name
in names]
412 table_types = [int] + [float
for f
in table_fields]
414 self.
create_table(self.placement_table_name, table_fields, table_types)
416 table_fields, table_types, check=
True)
418 native_values = [0
for t
in table_fields]
419 log.debug(
"%s", self.native_table_name)
420 log.debug(
"table fields %s", table_fields)
422 table_fields, native_values,
423 [
"assignment"], [
"\"native\""])
427 Return the names of the placement score fields in the database
431 col
for col
in columns
if "distance" in col
or "angle" in col]
436 Add a table to the database for store the values of the cross
437 correlation coefficient between a model and the native
442 table_fields = [
"solution_id",
"ccc"]
443 table_types = [int, float]
445 self.
create_table(self.ccc_table_name, table_fields, table_types)
448 table_fields, table_types, check=
True)
450 table_fields, [0, 1.00],
451 [
"assignment"], [
"\"native\""])
454 """ Format for the record to store in the ccc table """
455 return [solution_id, ccc]
459 Recover the cross-correlation coefficient for a solution
462 sql_command =
""" SELECT ccc FROM %s
463 WHERE solution_id=%d """ % (self.ccc_table_name,
468 def store_ccc_data(self, ccc_data):
469 self.
store_data(self.ccc_table_name, ccc_data)
471 def store_placement_data(self, data):
472 log.debug(
"store placement table %s", data)
473 self.
store_data(self.placement_table_name, data)
477 Format a left join SQL command that recovers all fields from the
479 @param pairs_table_field Pairs of (table,field)
480 @param tables_names Names of the tables
482 E.g. If pairs_table_filed = ((table1,a), (table2,b), (table3,c),
483 (table2,d)) and tables_names = (table1, table2, table3)
486 SELECT table1.a, table2.b, table3.c, table2.d FROM table1
487 LEFT JOIN table2 ON table1.solution_id = table2.solution_id
488 LEFT JOIN table3 ON table1.solution_id = table3.solution_id
489 WHERE table1.solution_id IS NOT NULL AND
490 table2.solution_id IS NOT NULL AND
491 table3.solution_id IS NOT NULL
494 txt = [
"%s.%s" % (p[0], p[1])
for p
in pairs_table_field]
495 fields_requested = field_delim.join(txt)
496 sql_command =
" SELECT %s FROM %s " % (
497 fields_requested, tables_names[0])
498 n_tables = len(tables_names)
499 for i
in range(1, n_tables):
500 a = tables_names[i - 1]
502 sql_command +=
" LEFT JOIN %s " \
503 "ON %s.solution_id = %s.solution_id " % (b, a, b)
506 for i
in range(n_tables - 1):
507 sql_command +=
"WHERE %s.solution_id " \
508 "IS NOT NULL AND " % tables_names[
510 sql_command +=
" %s.solution_id IS NOT NULL " % tables_names[
512 log.debug(
"%s" % sql_command)
517 Add a table to store information about the clusters of structures
518 @param name Name of the table
520 self.cluster_table_name = name
522 table_fields = (
"cluster_id",
"n_elements",
523 "representative",
"elements",
"solutions_ids")
524 table_types = (int, int, int, str, str)
529 elements, solutions_ids):
531 Add a record to the cluster database. Actually, only stores it
532 in a list (that will be added later)
533 @param cluster_id Number with the id of the cluster
534 @param n_elements Number of elements in the cluster
535 @param representative Number with the id of the representative
537 @param elements List with the number of the elements of the cluster
538 @param solutions_ids The numbers above are provided by the
539 clustering algorithm. The solutions_ids are the ids of the models
543 record = (cluster_id, n_elements, representative, elements,
545 log.debug(
"Adding cluster record: %s", record)
546 self.cluster_records.append(record)
550 Store the data for the clusters
552 log.info(
"Storing data of clusters. Number of records %s",
553 len(self.cluster_records))
554 self.
store_data(self.cluster_table_name, self.cluster_records)
558 Recover solutions for a specific list of results
559 @param fields Fields to recover from the database
560 @param solutions_ids A list with the desired solutions.
563 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
564 f = self.get_fields_string(fields)
565 str_ids =
",".join(map(str, solutions_ids))
568 (f, self.results_table, str_ids))
573 Get the position of the native configuration
574 @param orderby Criterion used to sort the solutions
579 native_value = data[0][0]
582 values = [row[0]
for row
in data]
583 rank = np.searchsorted(values, native_value)
588 Recover the information about the n-th largest cluster
589 @param position Cluster position (by size) requested
590 (1 is the largest cluster)
591 @param table_name Table where the information about the
594 s =
""" SELECT * FROM %s ORDER BY n_elements DESC """ % table_name
596 record = ClusterRecord(*data[position - 1])
601 Recovers from the database the placement scores for a set of
602 solutions, and returns the mean and standard deviation of the
603 placement score for each of the components of the complex being
604 scored. This function will be typically used to compute the
605 variation of the placement of each component within a cluster
607 @param solutions_ids The ids of the solutions used to compute
609 @return The output are 4 numpy vectors:
610 placement_distances_mean - The mean placement distance for each
612 placement_distances_stddev - The standard deviation of the
613 placement distance for each
615 placement_angles_mean - The mean placement angle for each
617 placement_angles_stddev - The standard deviation of the
618 placement angle for each component.
622 table = self.placements_table
624 distance_fields = [x
for x
in fields
if 'distance' in x]
625 angle_fields = [x
for x
in fields
if 'angle' in x]
626 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
628 str_ids =
",".join(map(str, solutions_ids))
629 log.debug(
"Solutions considered %s", solutions_ids)
630 s = sql_command % (
",".join(distance_fields), table, str_ids)
632 s = sql_command % (
",".join(angle_fields), table, str_ids)
634 D = np.array(data_distances)
635 placement_distances_mean = D.mean(axis=0)
636 placement_distances_stddev = D.std(axis=0)
637 A = np.array(data_angles)
638 placement_angles_mean = A.mean(axis=0)
639 placement_angles_stddev = A.std(axis=0)
640 return [placement_distances_mean, placement_distances_stddev,
641 placement_angles_mean, placement_angles_stddev]
645 Calculate the placement score and its standard deviation for
646 the complexes in a set of solutions. The values returned are
647 averages, as the placement score for a complex is the average
648 of the placement scores of the components. This function is used
649 to obtain global placement for a cluster of solutions.
650 @param solutions_ids The ids of the solutions used to compute
652 @return The output are 4 values:
653 plcd_mean - Average of the placement distance for the entire
654 complex over all the solutions.
655 plcd_std - Standard deviation of the placement distance for
656 the entire complex over all the solutions.
657 plca_mean - Average of the placement angle for the entire
658 complex over all the solutions.
659 plca_std - Standard deviation of the placement angle for
660 the entire complex over all the solutions.
662 [placement_distances_mean, placement_distances_stddev,
663 placement_angles_mean, placement_angles_stddev] = \
665 plcd_mean = placement_distances_mean.mean(axis=0)
666 plcd_std = placement_distances_stddev.mean(axis=0)
667 plca_mean = placement_angles_mean.mean(axis=0)
668 plca_std = placement_angles_stddev.mean(axis=0)
669 return [plcd_mean, plcd_std, plca_mean, plca_std]
def get_left_join_command
Format a left join SQL command that recovers all fields from the tables given.
def get_table_column_names
Get the names of the columns for a given table.
def __new__
Build from a tuple and the index used to compare.
Class to manage a SQL database built with sqlite3.
def gather_solution_results
Reads a set of database files and puts them in a single file Makes sure to reorder all column names i...
def add_clusters_table
Add a table to store information about the clusters of structures.
def get_placement_fields
Return the names of the placement score fields in the database.
def store_data
Inserts information in a given table of the database.
def add_ccc_table
Add a table to the database for store the values of the cross correlation coefficient between a model...
def format_placement_record
both distances and angles are expected to be a list of floats
The heapq algorithm is a min-heap.
def create_table
Creates a table.
def get_solutions
Get solutions from the database.
def retrieve_data
Retrieves data from the database using the sql_command returns the records as a list of tuples...
def add_record
Add a record to the database.
def gather_best_solution_results
Reads a set of database files and merge them into a single file.
Utility functions to handle IO.
Utility functions to manage SQL databases with sqlite3.
def add_placement_scores_table
Creates a table to store the values of the placement scores for the models.
def get_best_solution
Recover the reference frame of the n-th best solution from a database.
def drop_table
Delete a table if it exists.
def get_sorting_indices
Return indices that sort the list ls.
def get_individual_placement_statistics
Recovers from the database the placement scores for a set of solutions, and returns the mean and stan...
def get_native_solution
Recover data for the native solution.
def add_native_record
Add a record for the native structure to the database see add_record() for the meaning of the paramet...
Class for managing the results of the experiments.
def add_cluster_record
Add a record to the cluster database.
def get_placement_statistics
Calculate the placement score and its standard deviation for the complexes in a set of solutions...
def get_nth_largest_cluster
Recover the information about the n-th largest cluster.
def store_cluster_data
Store the data for the clusters.
def add_results_table
Build the table of results.
def get_fields_string
Get a list of fields and return a string with them.
def get_ccc
Recover the cross-correlation coefficient for a solution.
def format_ccc_record
Format for the record to store in the ccc table.
def add_columns
Add columns to the database.
def get_native_rank
Get the position of the native configuration.
def get_solutions_from_list
Recover solutions for a specific list of results.
def get_solutions_results_table
Recovers solutions.
def update_data
updates the register in the table identified by the condition values for the condition fields ...
def check_if_is_connected
Checks if the class is connected to the database filename.