1 """@namespace IMP.EMageFit.solutions_io
2 Utility functions to store and retrieve solution information.
19 log = logging.getLogger(
"solutions_io")
24 ClusterRecord = collections.namedtuple(
'ClusterRecord',
25 [
'cluster_id',
'n_elements',
'representative',
26 'elements',
'solutions_ids'])
37 The heapq algorithm is a min-heap. I want a max-heap, that pops the
38 larger values out of the heap.
39 For that I have to modify the comparison function and also set the
40 index that is used for the comparison. The index corresponds to
41 the restraint that we desired to order by
46 Build from a tuple and the index used to compare
49 return tuple.__new__(self, x)
53 Compare. To convert the min-heap into a max-heap, the lower than
54 comparison is transformed into a greater-than
57 if(self[i] > other[i]):
62 def __le__(self, other):
64 return self[i] >= other[i]
68 raisef=0.1, orderby=
"em2d"):
70 Reads a set of database files and merge them into a single file.
72 @param fns List of files with databases
73 @param fn_output The database to create
74 @param max_number Maximum number of records to keep, sorted according
76 @param raisef Ratio of problematic database files tolerated before
77 raising an error. This option is to tolerate some files
78 of the databases being broken because the cluster fails,
80 @param orderby Criterium used to sort the the records
82 Makes sure to reorder all column names if neccesary before merging
83 The record for the native solution is only added once (from first file).
89 names = db.get_table_column_names(tbl)
90 types = db.get_table_types(tbl)
92 sorted_names = [names[i]
for i
in indices]
93 sorted_types = [types[i]
for i
in indices]
96 ind = names.index(orderby)
97 they_are_sorted = field_delim.join(names)
99 sql_command =
"""SELECT %s FROM %s
100 WHERE assignment="native" LIMIT 1 """ % (they_are_sorted, tbl)
101 native_data = db.retrieve_data(sql_command)
103 log.info(
"Gathering results. Saving to %s", fn_output)
105 out_db.create(fn_output, overwrite=
True)
106 out_db.connect(fn_output)
107 out_db.create_table(tbl, sorted_names, sorted_types)
113 log.info(
"Reading %s", fn)
116 sql_command =
"""SELECT %s FROM %s
117 WHERE assignment<>"native"
118 ORDER BY %s ASC LIMIT %s """ % (
119 they_are_sorted, tbl, orderby, max_number)
120 data = db.retrieve_data(sql_command)
121 log.info(
"%s records read from %s", len(data), fn)
126 if(len(best_records) < max_number):
127 heapq.heappush(best_records, a)
131 if(best_records[0] < a):
132 heapq.heapreplace(best_records, a)
133 except Exception
as e:
134 log.error(
"Error for %s: %s", fn, e)
140 ratio = float(n_problems) / float(len(fns))
142 raise IOError(
"There are %8.1f %s of the database "
143 "files to merge with problems! " % (ratio * 100,
"%"))
145 heapq.heappush(best_records, native_data[0])
146 out_db.store_data(tbl, best_records)
152 Reads a set of database files and puts them in a single file
153 Makes sure to reorder all column names if neccesary before merging
154 @param fns List of database files
155 @param fn_output Name of the output database
156 @param raisef See help for gather_best_solution_results()
162 names = db.get_table_column_names(tbl)
163 types = db.get_table_types(tbl)
165 sorted_names = [names[i]
for i
in indices]
166 sorted_types = [types[i]
for i
in indices]
167 log.info(
"Gathering results. Saving to %s", fn_output)
169 out_db.create(fn_output, overwrite=
True)
170 out_db.connect(fn_output)
171 out_db.create_table(tbl, sorted_names, sorted_types)
176 log.info(
"Reading %s", fn)
178 names = sorted(db.get_table_column_names(tbl))
179 they_are_sorted = field_delim.join(names)
180 log.debug(
"Retrieving %s", they_are_sorted)
181 sql_command =
"SELECT %s FROM %s" % (they_are_sorted, tbl)
182 data = db.retrieve_data(sql_command)
183 out_db.store_data(tbl, data)
185 except Exception
as e:
186 log.error(
"Error for file %s: %s", fn, e)
188 ratio = float(n_problems) / float(len(fns))
190 raise IOError(
"There are %8.1f %s of the database "
191 "files to merge with problems! " % (ratio * 100,
"%"))
196 """ Return indices that sort the list l """
197 pairs = sorted([(element, i)
for i, element
in enumerate(l)])
198 indices = [p[1]
for p
in pairs]
205 Recover the reference frame of the n-th best solution from a database.
206 The index Nth stars at 0
209 sql_command =
""" SELECT %s FROM %s
211 ASC LIMIT 1 OFFSET %d """ % (f, tbl, orderby, Nth)
212 data = database.read_data(fn_database, sql_command)
214 raise ValueError(
"The requested %s-th best solution does not exist. "
215 "Only %s solutions found" % (Nth, len(data)))
220 def get_pca(string, delimiter="/"):
221 pca = string.split(delimiter)
222 pca = [float(p)
for p
in pca]
228 Get a list of fields and return a string with them. If there are no
229 fields, return an *, indicating SQL that all the fields are requested
230 @param fields A list of strings
235 return field_delim.join(fields)
242 Class for managing the results of the experiments
245 def __init__(self, ):
247 self.native_table_name =
"native"
248 self.results_table =
"results"
249 self.placements_table =
"placements"
250 self.ccc_table_name =
"ccc"
251 self.cluster_records = []
254 self.results_description_columns = [
"solution_id",
"assignment",
256 self.results_description_types = [int, str, str]
258 self.results_measures_columns = [
"drms",
"cdrms",
"crmsd"]
259 self.results_measures_types = [float, float, float]
263 Build the table of results
264 @param restraints_names The names given to the columns of the table
265 @param add_measures If True, add fields for comparing models
266 and native conformation
268 table_fields = self.results_description_columns + \
269 [
"total_score"] + restraints_names
270 table_types = self.results_description_types + \
271 [float] + [float
for r
in restraints_names]
274 table_fields += self.results_measures_columns
275 table_types += self.results_measures_types
276 log.debug(
"Creating table %s\n%s", table_fields, table_types)
277 self.
create_table(self.results_table, table_fields, table_types)
281 self.native_table_name,
286 max_number=
None, orderby=
False):
289 @param fields Fields to recover from the table
290 @param max_number Maximum number of solutions to recover
291 @param orderby Name of the restraint used for sorting the states
294 log.info(
"Getting %s from solutions", fields)
295 f = self.get_fields_string(fields)
296 sql_command =
"SELECT %s FROM %s " % (f, self.results_table)
298 sql_command +=
" ORDER BY %s ASC" % orderby
299 if max_number
not in (
None,
False):
300 sql_command +=
" LIMIT %d" % (max_number)
301 log.debug(
"Using %s", sql_command)
307 Get solutions from the database.
308 @param fields Fields requested. If the fields are in different
309 tables, a left join is done. Otherwise get_solutions_results_table()
310 is called. See get_solutions_results_table() for the meaning
315 tables = self.get_tables_names()
316 log.debug(
"tables %s", tables)
317 required_tables = set()
318 pairs_table_field = []
322 for f, t
in [(f, t)
for f
in fields
for t
in tables]:
323 if t ==
"native" or f ==
"solution_id":
327 required_tables.add(t)
328 pairs_table_field.append((t, f))
329 required_tables = list(required_tables)
330 log.debug(
"required_tables %s", required_tables)
331 log.debug(
"pairs_table_field %s", pairs_table_field)
332 if len(required_tables) == 0:
336 elif len(required_tables) == 1
and required_tables[0] ==
"results":
340 elif len(required_tables) > 1:
344 sql_command +=
" ORDER BY %s ASC" % orderby
345 log.debug(
"Using %s", sql_command)
349 raise ValueError(
"Fields not found in the database")
353 Recover data for the native solution
354 @param fields Fields to recover
357 f = self.get_fields_string(fields)
358 sql_command =
"SELECT %s FROM %s " % (f, self.native_table_name)
362 def add_record(self, solution_id, assignment, RFs, total_score,
363 restraints_scores, measures):
365 Add a recorde to the database
366 @param solution_id The key for the solution
367 @param assignment The assigment for the solution provided by
369 @param RFs Reference frames of the rigid bodies of the components
370 of the assembly in the solution
371 @param total_score Total value of the scoring function
372 @param restraints_scores A list with all the values for the
374 @param measures A list with the values of all the measures for
377 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
378 RFs_txt = unit_delim.join(words)
379 record = [solution_id, assignment, RFs_txt, total_score] + \
381 if measures
is not None:
382 record = record + measures
383 self.records.append(record)
388 Add a record for the native structure to the database
389 see add_record() for the meaning of the parameters
391 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
392 RFs_txt = unit_delim.join(words)
394 record = [solution_id, assignment, RFs_txt, total_score] + \
397 record = record + measures
398 self.
store_data(self.native_table_name, [record])
400 def save_records(self, table="results"):
404 """ both distances and angles are expected to be a list of floats """
405 return [solution_id] + distances + angles
409 Creates a table to store the values of the placement scores for the
411 @param names Names of the components of the assembly
414 self.placement_table_name = self.placements_table
415 table_fields = [
"solution_id"]
416 table_fields += [
"distance_%s" % name
for name
in names]
417 table_fields += [
"angle_%s" % name
for name
in names]
418 table_types = [int] + [float
for f
in table_fields]
420 self.
create_table(self.placement_table_name, table_fields, table_types)
422 table_fields, table_types, check=
True)
424 native_values = [0
for t
in table_fields]
425 log.debug(
"%s", self.native_table_name)
426 log.debug(
"table fields %s", table_fields)
428 table_fields, native_values,
429 [
"assignment"], [
"\"native\""])
433 Return the names of the placement score fields in the database
437 col
for col
in columns
if "distance" in col
or "angle" in col]
442 Add a table to the database for store the values of the cross
443 correlation coefficient between a model and the native configuration
447 table_fields = [
"solution_id",
"ccc"]
448 table_types = [int, float]
450 self.
create_table(self.ccc_table_name, table_fields, table_types)
453 table_fields, table_types, check=
True)
455 table_fields, [0, 1.00], [
"assignment"], [
"\"native\""])
458 """ Format for the record to store in the ccc table """
459 return [solution_id, ccc]
463 Recover the cross-correlation coefficient for a solution
466 sql_command =
""" SELECT ccc FROM %s
467 WHERE solution_id=%d """ % (self.ccc_table_name,
472 def store_ccc_data(self, ccc_data):
473 self.
store_data(self.ccc_table_name, ccc_data)
475 def store_placement_data(self, data):
476 log.debug(
"store placement table %s", data)
477 self.
store_data(self.placement_table_name, data)
481 Format a left join SQL command that recovers all fileds from the
483 @param pairs_table_field Pairs of (table,field)
484 @param tables_names Names of the tables
486 E.g. If pairs_table_filed = ((table1,a), (table2,b), (table3,c),
487 (table2,d)) and tables_names = (table1, table2, table3)
490 SELECT table1.a, table2.b, table3.c, table2.d FROM table1
491 LEFT JOIN table2 ON table1.solution_id = table2.solution_id
492 LEFT JOIN table3 ON table1.solution_id = table3.solution_id
493 WHERE table1.solution_id IS NOT NULL AND
494 table2.solution_id IS NOT NULL AND
495 table3.solution_id IS NOT NULL
498 txt = [
"%s.%s" % (p[0], p[1])
for p
in pairs_table_field]
499 fields_requested = field_delim.join(txt)
500 sql_command =
" SELECT %s FROM %s " % (
501 fields_requested, tables_names[0])
502 n_tables = len(tables_names)
503 for i
in range(1, n_tables):
504 a = tables_names[i - 1]
506 sql_command +=
" LEFT JOIN %s " \
507 "ON %s.solution_id = %s.solution_id " % (b, a, b)
510 for i
in range(n_tables - 1):
511 sql_command +=
"WHERE %s.solution_id " \
512 "IS NOT NULL AND " % tables_names[
514 sql_command +=
" %s.solution_id IS NOT NULL " % tables_names[
516 log.debug(
"%s" % sql_command)
521 Add a table to store information about the clusters of structures
522 @param name Name of the table
524 self.cluster_table_name = name
526 table_fields = (
"cluster_id",
"n_elements",
527 "representative",
"elements",
"solutions_ids")
528 table_types = (int, int, int, str, str)
533 elements, solutions_ids):
535 Add a record to the cluster database. Actually, only stores it
536 in a list (that will be added later)
537 @param cluster_id Number with the id of the cluster
538 @param n_elements Number of elements in the cluster
539 @param representative Number with the id of the representative
541 @param elements List with the number of the elements of the cluster
542 @param solutions_ids The numbers above are provided by the
543 clustering algorithm. The solutions_ids are the ids of the models
547 record = (cluster_id, n_elements, representative, elements,
549 log.debug(
"Adding cluster record: %s", record)
550 self.cluster_records.append(record)
554 Store the data for the clusters
556 log.info(
"Storing data of clusters. Number of records %s",
557 len(self.cluster_records))
558 self.
store_data(self.cluster_table_name, self.cluster_records)
562 Recover solutions for a specific list of results
563 @param fields Fields to recover fro the database
564 @param solutions_ids A list with the desired solutions. E.g. [0,3,6]
566 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
567 f = self.get_fields_string(fields)
568 str_ids =
",".join(map(str, solutions_ids))
571 (f, self.results_table, str_ids))
576 Get the position of the native configuration
577 @param orderby Criterium used to sort the solutions
582 native_value = data[0][0]
585 values = [row[0]
for row
in data]
586 rank = np.searchsorted(values, native_value)
591 Recover the the information about the n-th largest cluster
592 @param position Cluster position (by size) requested
593 (1 is the largest cluster)
594 @param table_name Table where the information about the
597 s =
""" SELECT * FROM %s ORDER BY n_elements DESC """ % table_name
599 record = ClusterRecord(*data[position - 1])
604 Recovers from the database the placement scores for a set of
605 solutions, and returns the mean and standard deviation of the
606 placement score for each of the components of the complex being
607 scored. This function will be typical used to compute the variation
608 of the placement of each component within a cluster of solutions
609 @param solutions_ids The ids of the solutions used to compute
611 @return The output are 4 numpy vectors:
612 placement_distances_mean - The mean placement distance for each
614 placement_distances_stddev - The standardd deviation of the
615 placement distance for each component
616 placement_angles_mean - The mean placement angle for each
618 placement_angles_stddev - The standard deviation of the placement
619 angle for each component,
623 table = self.placements_table
625 distance_fields = [x
for x
in fields
if 'distance' in x]
626 angle_fields = [x
for x
in fields
if 'angle' in x]
627 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
629 str_ids =
",".join(map(str, solutions_ids))
630 log.debug(
"Solutions considered %s", solutions_ids)
631 s = sql_command % (
",".join(distance_fields), table, str_ids)
633 s = sql_command % (
",".join(angle_fields), table, str_ids)
635 D = np.array(data_distances)
636 placement_distances_mean = D.mean(axis=0)
637 placement_distances_stddev = D.std(axis=0)
638 A = np.array(data_angles)
639 placement_angles_mean = A.mean(axis=0)
640 placement_angles_stddev = A.std(axis=0)
641 return [placement_distances_mean, placement_distances_stddev,
642 placement_angles_mean, placement_angles_stddev]
646 Calculate the placement score and its standard deviation for
647 the complexes in a set of solutions. The values returned are
648 averages, as the placement score for a complex is the average
649 of the placement scores of the components. This function is used
650 to obtain global placement for a cluster of solutions.
651 @param solutions_ids The ids of the solutions used to compute
653 @return The output are 4 values:
654 plcd_mean - Average of the placement distance for the entire
655 complex over all the solutions.
656 plcd_std - Standard deviation of the placement distance for
657 the entire complex over all the solutions.
658 plca_mean - Average of the placement angle for the entire
659 complex over all the solutions.
660 plca_std - Standard deviation of the placement angle for
661 the entire complex over all the solutions.
663 [placement_distances_mean, placement_distances_stddev,
664 placement_angles_mean, placement_angles_stddev] = \
666 plcd_mean = placement_distances_mean.mean(axis=0)
667 plcd_std = placement_distances_stddev.mean(axis=0)
668 plca_mean = placement_angles_mean.mean(axis=0)
669 plca_std = placement_angles_stddev.mean(axis=0)
670 return [plcd_mean, plcd_std, plca_mean, plca_std]
def get_left_join_command
Format a left join SQL command that recovers all fileds from the tables given.
def get_table_column_names
Get the names of the columns for a given table.
def __new__
Build from a tuple and the index used to compare.
Class to manage a SQL database built with sqlite3.
def gather_solution_results
Reads a set of database files and puts them in a single file Makes sure to reorder all column names i...
def add_clusters_table
Add a table to store information about the clusters of structures.
def get_placement_fields
Return the names of the placement score fields in the database.
def store_data
Inserts information in a given table of the database.
def add_ccc_table
Add a table to the database for store the values of the cross correlation coefficient between a model...
def format_placement_record
both distances and angles are expected to be a list of floats
The heapq algorithm is a min-heap.
def create_table
Creates a table.
def get_solutions
Get solutions from the database.
def retrieve_data
Retrieves data from the database using the sql_command returns the records as a list of tuples...
def add_record
Add a recorde to the database.
def gather_best_solution_results
Reads a set of database files and merge them into a single file.
Utility functions to handle IO.
Utility functions to manage SQL databases with sqlite3.
def add_placement_scores_table
Creates a table to store the values of the placement scores for the models.
def get_best_solution
Recover the reference frame of the n-th best solution from a database.
def drop_table
Delete a table if it exists.
def get_sorting_indices
Return indices that sort the list l.
def get_individual_placement_statistics
Recovers from the database the placement scores for a set of solutions, and returns the mean and stan...
def get_native_solution
Recover data for the native solution.
def add_native_record
Add a record for the native structure to the database see add_record() for the meaning of the paramet...
Class for managing the results of the experiments.
def add_cluster_record
Add a record to the cluster database.
def get_placement_statistics
Calculate the placement score and its standard deviation for the complexes in a set of solutions...
def get_nth_largest_cluster
Recover the the information about the n-th largest cluster.
def store_cluster_data
Store the data for the clusters.
def add_results_table
Build the table of results.
def get_fields_string
Get a list of fields and return a string with them.
def get_ccc
Recover the cross-correlation coefficient for a solution.
def format_ccc_record
Format for the record to store in the ccc table.
def add_columns
Add columns to the database.
def get_native_rank
Get the position of the native configuration.
def get_solutions_from_list
Recover solutions for a specific list of results.
def get_solutions_results_table
Recovers solutions.
def update_data
updates the register in the table identified by the condition values for the condition fields ...
def check_if_is_connected
Checks if the class is connected to the database filename.