1 """@namespace IMP.em2d.solutions_io
2 Utility functions to store and retrieve solution information.
5 import IMP.em2d.imp_general.io
as io
21 from sets
import Set
as set
23 log = logging.getLogger(
"solutions_io")
31 """Simple named tuple class"""
33 class _itemgetter(object):
35 def __init__(self, ind):
38 def __call__(self, obj):
39 return obj[self.__ind]
41 def __init__(self, iterable):
42 if len(iterable) != self.__n_fields:
43 raise TypeError(
"Expected %d arguments, got %d"
44 % (self.__n_fields, len(iterable)))
45 tuple.__init__(self, iterable)
48 cluster_id = property(_itemgetter(0))
49 n_elements = property(_itemgetter(1))
50 representative = property(_itemgetter(2))
51 elements = property(_itemgetter(3))
52 solutions_ids = property(_itemgetter(4))
64 The heapq algorithm is a min-heap. I want a max-heap, that pops the
65 larger values out of the heap.
66 For that I have to modify the comparison function and also set the
67 index that is used for the comparison. The index corresponds to
68 the restraint that we desired to order by
71 def __new__(self, x, i):
73 Build from a tuple and the index used to compare
76 return tuple.__new__(self, x)
78 def __lt__(self, other):
80 Compare. To convert the min-heap into a max-heap, the lower than
81 comparison is transformed into a greater-than
84 if(self[i] > other[i]):
89 def __le__(self, other):
91 return self[i] >= other[i]
95 raisef=0.1, orderby=
"em2d"):
97 Reads a set of database files and merge them into a single file.
99 @param fns List of files with databases
100 @param fn_output The database to create
101 @param max_number Maximum number of records to keep, sorted according
103 @param raisef Ratio of problematic database files tolerated before
104 raising an error. This option is to tolerate some files
105 of the databases being broken because the cluster fails,
107 @param orderby Criterium used to sort the the records
109 Makes sure to reorder all column names if neccesary before merging
110 The record for the native solution is only added once (from first file).
116 names = db.get_table_column_names(tbl)
117 types = db.get_table_types(tbl)
119 sorted_names = [names[i]
for i
in indices]
120 sorted_types = [types[i]
for i
in indices]
123 ind = names.index(orderby)
124 they_are_sorted = field_delim.join(names)
126 sql_command =
"""SELECT %s FROM %s
127 WHERE assignment="native" LIMIT 1 """ % (they_are_sorted, tbl)
128 native_data = db.retrieve_data(sql_command)
130 log.info(
"Gathering results. Saving to %s", fn_output)
132 out_db.create(fn_output, overwrite=
True)
133 out_db.connect(fn_output)
134 out_db.create_table(tbl, sorted_names, sorted_types)
140 log.info(
"Reading %s", fn)
143 sql_command =
"""SELECT %s FROM %s
144 WHERE assignment<>"native"
145 ORDER BY %s ASC LIMIT %s """ % (
146 they_are_sorted, tbl, orderby, max_number)
147 data = db.retrieve_data(sql_command)
148 log.info(
"%s records read from %s", len(data), fn)
153 if(len(best_records) < max_number):
154 heapq.heappush(best_records, a)
158 if(best_records[0] < a):
159 heapq.heapreplace(best_records, a)
160 except Exception
as e:
161 log.error(
"Error for %s: %s", fn, e)
167 ratio = float(n_problems) / float(len(fns))
169 raise IOError(
"There are %8.1f %s of the database "
170 "files to merge with problems! " % (ratio * 100,
"%"))
172 heapq.heappush(best_records, native_data[0])
173 out_db.store_data(tbl, best_records)
179 Reads a set of database files and puts them in a single file
180 Makes sure to reorder all column names if neccesary before merging
181 @param fns List of database files
182 @param fn_output Name of the output database
183 @param raisef See help for gather_best_solution_results()
189 names = db.get_table_column_names(tbl)
190 types = db.get_table_types(tbl)
192 sorted_names = [names[i]
for i
in indices]
193 sorted_types = [types[i]
for i
in indices]
194 log.info(
"Gathering results. Saving to %s", fn_output)
196 out_db.create(fn_output, overwrite=
True)
197 out_db.connect(fn_output)
198 out_db.create_table(tbl, sorted_names, sorted_types)
203 log.info(
"Reading %s", fn)
205 names = sorted(db.get_table_column_names(tbl))
206 they_are_sorted = field_delim.join(names)
207 log.debug(
"Retrieving %s", they_are_sorted)
208 sql_command =
"SELECT %s FROM %s" % (they_are_sorted, tbl)
209 data = db.retrieve_data(sql_command)
210 out_db.store_data(tbl, data)
212 except Exception
as e:
213 log.error(
"Error for file %s: %s", fn, e)
215 ratio = float(n_problems) / float(len(fns))
217 raise IOError(
"There are %8.1f %s of the database "
218 "files to merge with problems! " % (ratio * 100,
"%"))
223 """ Return indices that sort the list l """
224 pairs = sorted([(element, i)
for i, element
in enumerate(l)])
225 indices = [p[1]
for p
in pairs]
232 Recover the reference frame of the n-th best solution from a database.
233 The index Nth stars at 0
236 sql_command =
""" SELECT %s FROM %s
238 ASC LIMIT 1 OFFSET %d """ % (f, tbl, orderby, Nth)
239 data = Database.read_data(fn_database, sql_command)
241 raise ValueError(
"The requested %s-th best solution does not exist. "
242 "Only %s solutions found" % (Nth, len(data)))
247 def get_pca(string, delimiter="/"):
248 pca = string.split(delimiter)
249 pca = [float(p)
for p
in pca]
255 Get a list of fields and return a string with them. If there are no
256 fields, return an *, indicating SQL that all the fields are requested
257 @param fields A list of strings
262 return field_delim.join(fields)
269 Class for managing the results of the experiments
272 def __init__(self, ):
274 self.native_table_name =
"native"
275 self.results_table =
"results"
276 self.placements_table =
"placements"
277 self.ccc_table_name =
"ccc"
278 self.cluster_records = []
281 self.results_description_columns = [
"solution_id",
"assignment",
283 self.results_description_types = [int, str, str]
285 self.results_measures_columns = [
"drms",
"cdrms",
"crmsd"]
286 self.results_measures_types = [float, float, float]
290 Build the table of results
291 @param restraints_names The names given to the columns of the table
292 @param add_measures If True, add fields for comparing models
293 and native conformation
295 table_fields = self.results_description_columns + \
296 [
"total_score"] + restraints_names
297 table_types = self.results_description_types + \
298 [float] + [float
for r
in restraints_names]
301 table_fields += self.results_measures_columns
302 table_types += self.results_measures_types
303 log.debug(
"Creating table %s\n%s", table_fields, table_types)
304 self.
create_table(self.results_table, table_fields, table_types)
308 self.native_table_name,
313 max_number=
None, orderby=
False):
316 @param fields Fields to recover from the table
317 @param max_number Maximum number of solutions to recover
318 @param orderby Name of the restraint used for sorting the states
321 log.info(
"Getting %s from solutions", fields)
322 f = self.get_fields_string(fields)
323 sql_command =
"SELECT %s FROM %s " % (f, self.results_table)
325 sql_command +=
" ORDER BY %s ASC" % orderby
326 if max_number
not in (
None,
False):
327 sql_command +=
" LIMIT %d" % (max_number)
328 log.debug(
"Using %s", sql_command)
334 Get solutions from the database.
335 @param fields Fields requested. If the fields are in different
336 tables, a left join is done. Otherwise get_solutions_results_table()
337 is called. See get_solutions_results_table() for the meaning
342 tables = self.get_tables_names()
343 log.debug(
"tables %s", tables)
344 required_tables = set()
345 pairs_table_field = []
349 for f, t
in [(f, t)
for f
in fields
for t
in tables]:
350 if t ==
"native" or f ==
"solution_id":
354 required_tables.add(t)
355 pairs_table_field.append((t, f))
356 required_tables = list(required_tables)
357 log.debug(
"required_tables %s", required_tables)
358 log.debug(
"pairs_table_field %s", pairs_table_field)
359 if len(required_tables) == 0:
363 elif len(required_tables) == 1
and required_tables[0] ==
"results":
367 elif len(required_tables) > 1:
371 sql_command +=
" ORDER BY %s ASC" % orderby
372 log.debug(
"Using %s", sql_command)
376 raise ValueError(
"Fields not found in the database")
380 Recover data for the native solution
381 @param fields Fields to recover
384 f = self.get_fields_string(fields)
385 sql_command =
"SELECT %s FROM %s " % (f, self.native_table_name)
389 def add_record(self, solution_id, assignment, RFs, total_score,
390 restraints_scores, measures):
392 Add a recorde to the database
393 @param solution_id The key for the solution
394 @param assignment The assigment for the solution provided by
396 @param RFs Reference frames of the rigid bodies of the components
397 of the assembly in the solution
398 @param total_score Total value of the scoring function
399 @param restraints_scores A list with all the values for the
401 @param measures A list with the values of all the measures for
404 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
405 RFs_txt = unit_delim.join(words)
406 record = [solution_id, assignment, RFs_txt, total_score] + \
408 if measures
is not None:
409 record = record + measures
410 self.records.append(record)
415 Add a record for the native structure to the database
416 see add_record() for the meaning of the parameters
418 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
419 RFs_txt = unit_delim.join(words)
421 record = [solution_id, assignment, RFs_txt, total_score] + \
424 record = record + measures
425 self.
store_data(self.native_table_name, [record])
427 def save_records(self, table="results"):
431 """ both distances and angles are expected to be a list of floats """
432 return [solution_id] + distances + angles
436 Creates a table to store the values of the placement scores for the
438 @param names Names of the components of the assembly
441 self.placement_table_name = self.placements_table
442 table_fields = [
"solution_id"]
443 table_fields += [
"distance_%s" % name
for name
in names]
444 table_fields += [
"angle_%s" % name
for name
in names]
445 table_types = [int] + [float
for f
in table_fields]
447 self.
create_table(self.placement_table_name, table_fields, table_types)
449 table_fields, table_types, check=
True)
451 native_values = [0
for t
in table_fields]
452 log.debug(
"%s", self.native_table_name)
453 log.debug(
"table fields %s", table_fields)
455 table_fields, native_values,
456 [
"assignment"], [
"\"native\""])
460 Return the names of the placement score fields in the database
464 col
for col
in columns
if "distance" in col
or "angle" in col]
469 Add a table to the database for store the values of the cross
470 correlation coefficient between a model and the native configuration
474 table_fields = [
"solution_id",
"ccc"]
475 table_types = [int, float]
477 self.
create_table(self.ccc_table_name, table_fields, table_types)
480 table_fields, table_types, check=
True)
482 table_fields, [0, 1.00], [
"assignment"], [
"\"native\""])
485 """ Format for the record to store in the ccc table """
486 return [solution_id, ccc]
490 Recover the cross-correlation coefficient for a solution
493 sql_command =
""" SELECT ccc FROM %s
494 WHERE solution_id=%d """ % (self.ccc_table_name,
499 def store_ccc_data(self, ccc_data):
500 self.
store_data(self.ccc_table_name, ccc_data)
502 def store_placement_data(self, data):
503 log.debug(
"store placement table %s", data)
504 self.
store_data(self.placement_table_name, data)
508 Format a left join SQL command that recovers all fileds from the
510 @param pairs_table_field Pairs of (table,field)
511 @param tables_names Names of the tables
513 E.g. If pairs_table_filed = ((table1,a), (table2,b), (table3,c),
514 (table2,d)) and tables_names = (table1, table2, table3)
517 SELECT table1.a, table2.b, table3.c, table2.d FROM table1
518 LEFT JOIN table2 ON table1.solution_id = table2.solution_id
519 LEFT JOIN table3 ON table1.solution_id = table3.solution_id
520 WHERE table1.solution_id IS NOT NULL AND
521 table2.solution_id IS NOT NULL AND
522 table3.solution_id IS NOT NULL
525 txt = [
"%s.%s" % (p[0], p[1])
for p
in pairs_table_field]
526 fields_requested = field_delim.join(txt)
527 sql_command =
" SELECT %s FROM %s " % (
528 fields_requested, tables_names[0])
529 n_tables = len(tables_names)
530 for i
in range(1, n_tables):
531 a = tables_names[i - 1]
533 sql_command +=
" LEFT JOIN %s " \
534 "ON %s.solution_id = %s.solution_id " % (b, a, b)
537 for i
in range(n_tables - 1):
538 sql_command +=
"WHERE %s.solution_id " \
539 "IS NOT NULL AND " % tables_names[
541 sql_command +=
" %s.solution_id IS NOT NULL " % tables_names[
543 log.debug(
"%s" % sql_command)
548 Add a table to store information about the clusters of structures
549 @param name Name of the table
551 self.cluster_table_name = name
553 table_fields = (
"cluster_id",
"n_elements",
554 "representative",
"elements",
"solutions_ids")
555 table_types = (int, int, int, str, str)
560 elements, solutions_ids):
562 Add a record to the cluster database. Actually, only stores it
563 in a list (that will be added later)
564 @param cluster_id Number with the id of the cluster
565 @param n_elements Number of elements in the cluster
566 @param representative Number with the id of the representative
568 @param elements List with the number of the elements of the cluster
569 @param solutions_ids The numbers above are provided by the
570 clustering algorithm. The solutions_ids are the ids of the models
574 record = (cluster_id, n_elements, representative, elements,
576 log.debug(
"Adding cluster record: %s", record)
577 self.cluster_records.append(record)
581 Store the data for the clusters
583 log.info(
"Storing data of clusters. Number of records %s",
584 len(self.cluster_records))
585 self.
store_data(self.cluster_table_name, self.cluster_records)
589 Recover solutions for a specific list of results
590 @param fields Fields to recover fro the database
591 @param solutions_ids A list with the desired solutions. E.g. [0,3,6]
593 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
594 f = self.get_fields_string(fields)
595 str_ids =
",".join(map(str, solutions_ids))
598 (f, self.results_table, str_ids))
603 Get the position of the native configuration
604 @param orderby Criterium used to sort the solutions
609 native_value = data[0][0]
612 values = [row[0]
for row
in data]
613 rank = np.searchsorted(values, native_value)
618 Recover the the information about the n-th largest cluster
619 @param position Cluster position (by size) requested
620 (1 is the largest cluster)
621 @param table_name Table where the information about the
624 s =
""" SELECT * FROM %s ORDER BY n_elements DESC """ % table_name
631 Recovers from the database the placement scores for a set of
632 solutions, and returns the mean and standard deviation of the
633 placement score for each of the components of the complex being
634 scored. This function will be typical used to compute the variation
635 of the placement of each component within a cluster of solutions
636 @param solutions_ids The ids of the solutions used to compute
638 @return The output are 4 numpy vectors:
639 placement_distances_mean - The mean placement distance for each
641 placement_distances_stddev - The standardd deviation of the
642 placement distance for each component
643 placement_angles_mean - The mean placement angle for each
645 placement_angles_stddev - The standard deviation of the placement
646 angle for each component,
650 table = self.placements_table
652 distance_fields = filter(
lambda x:
'distance' in x, fields)
653 angle_fields = filter(
lambda x:
'angle' in x, fields)
654 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
656 str_ids =
",".join(map(str, solutions_ids))
657 log.debug(
"Solutions considered %s", solutions_ids)
658 s = sql_command % (
",".join(distance_fields), table, str_ids)
660 s = sql_command % (
",".join(angle_fields), table, str_ids)
662 D = np.array(data_distances)
663 placement_distances_mean = D.mean(axis=0)
664 placement_distances_stddev = D.std(axis=0)
665 A = np.array(data_angles)
666 placement_angles_mean = A.mean(axis=0)
667 placement_angles_stddev = A.std(axis=0)
668 return [placement_distances_mean, placement_distances_stddev,
669 placement_angles_mean, placement_angles_stddev]
673 Calculate the placement score and its standard deviation for
674 the complexes in a set of solutions. The values returned are
675 averages, as the placement score for a complex is the average
676 of the placement scores of the components. This function is used
677 to obtain global placement for a cluster of solutions.
678 @param solutions_ids The ids of the solutions used to compute
680 @return The output are 4 values:
681 plcd_mean - Average of the placement distance for the entire
682 complex over all the solutions.
683 plcd_std - Standard deviation of the placement distance for
684 the entire complex over all the solutions.
685 plca_mean - Average of the placement angle for the entire
686 complex over all the solutions.
687 plca_std - Standard deviation of the placement angle for
688 the entire complex over all the solutions.
690 [placement_distances_mean, placement_distances_stddev,
691 placement_angles_mean, placement_angles_stddev] = \
693 plcd_mean = placement_distances_mean.mean(axis=0)
694 plcd_std = placement_distances_stddev.mean(axis=0)
695 plca_mean = placement_angles_mean.mean(axis=0)
696 plca_std = placement_angles_stddev.mean(axis=0)
697 return [plcd_mean, plcd_std, plca_mean, plca_std]
def format_placement_record
both distances and angles are expected to be a list of floats
def add_placement_scores_table
Creates a table to store the values of the placement scores for the models.
def check_if_is_connected
Checks if the class is connected to the database filename.
Utility functions to manage SQL databases with sqlite3.
def get_solutions
Get solutions from the database.
def get_solutions_results_table
Recovers solutions.
def get_best_solution
Recover the reference frame of the n-th best solution from a database.
def store_data
Inserts information in a given table of the database.
def get_sorting_indices
Return indices that sort the list l.
def retrieve_data
Retrieves data from the database using the sql_command returns the records as a list of tuples...
def get_table_column_names
Get the names of the columns for a given table.
def update_data
updates the register in the table identified by the condition values for the condition fields ...
def format_ccc_record
Format for the record to store in the ccc table.
def add_ccc_table
Add a table to the database for store the values of the cross correlation coefficient between a model...
def get_placement_statistics
Calculate the placement score and its standard deviation for the complexes in a set of solutions...
Simple named tuple class.
def add_results_table
Build the table of results.
def get_nth_largest_cluster
Recover the the information about the n-th largest cluster.
def add_columns
Add columns to the database.
def drop_table
Delete a table if it exists.
def get_solutions_from_list
Recover solutions for a specific list of results.
Class to manage a SQL database built with sqlite3.
def add_native_record
Add a record for the native structure to the database see add_record() for the meaning of the paramet...
def add_record
Add a recorde to the database.
def store_cluster_data
Store the data for the clusters.
def gather_solution_results
Reads a set of database files and puts them in a single file Makes sure to reorder all column names i...
Class for managing the results of the experiments.
def get_native_rank
Get the position of the native configuration.
def create_table
Creates a table.
def get_individual_placement_statistics
Recovers from the database the placement scores for a set of solutions, and returns the mean and stan...
def get_fields_string
Get a list of fields and return a string with them.
The heapq algorithm is a min-heap.
def get_native_solution
Recover data for the native solution.
def get_ccc
Recover the cross-correlation coefficient for a solution.
def get_left_join_command
Format a left join SQL command that recovers all fileds from the tables given.
def get_placement_fields
Return the names of the placement score fields in the database.
def add_clusters_table
Add a table to store information about the clusters of structures.
def add_cluster_record
Add a record to the cluster database.
def gather_best_solution_results
Reads a set of database files and merge them into a single file.