1 """@namespace IMP.em2d.solutions_io
2 Utility functions to store and retrieve solution information.
5 import IMP.em2d.imp_general.io
as io
21 from sets
import Set
as set
23 log = logging.getLogger(
"solutions_io")
29 """Simple named tuple class"""
31 class _itemgetter(object):
32 def __init__(self, ind):
34 def __call__(self, obj):
35 return obj[self.__ind]
37 def __init__(self, iterable):
38 if len(iterable) != self.__n_fields:
39 raise TypeError(
"Expected %d arguments, got %d" \
40 % (self.__n_fields, len(iterable)))
41 tuple.__init__(self, iterable)
44 cluster_id = property(_itemgetter(0))
45 n_elements = property(_itemgetter(1))
46 representative = property(_itemgetter(2))
47 elements = property(_itemgetter(3))
48 solutions_ids = property(_itemgetter(4))
59 The heapq algorithm is a min-heap. I want a max-heap, that pops the
60 larger values out of the heap.
61 For that I have to modify the comparison function and also set the
62 index that is used for the comparison. The index corresponds to
63 the restraint that we desired to order by
65 def __new__(self,x,i):
67 Build from a tuple and the index used to compare
70 return tuple.__new__(self, x)
72 def __lt__(self, other):
74 Compare. To convert the min-heap into a max-heap, the lower than
75 comparison is transformed into a greater-than
78 if(self[i] > other[i]):
83 def __le__(self, other):
85 return self[i] >= other[i]
89 raisef=0.1, orderby=
"em2d"):
91 Reads a set of database files and merge them into a single file.
93 @param fns List of files with databases
94 @param fn_output The database to create
95 @param max_number Maximum number of records to keep, sorted according
97 @param raisef Ratio of problematic database files tolerated before
98 raising an error. This option is to tolerate some files
99 of the databases being broken because the cluster fails,
101 @param orderby Criterium used to sort the the records
103 Makes sure to reorder all column names if neccesary before merging
104 The record for the native solution is only added once (from first file).
110 names = db.get_table_column_names(tbl)
111 types = db.get_table_types(tbl)
113 sorted_names = [ names[i]
for i
in indices]
114 sorted_types = [ types[i]
for i
in indices]
117 ind = names.index(orderby)
118 they_are_sorted = field_delim.join(names)
120 sql_command =
"""SELECT %s FROM %s
121 WHERE assignment="native" LIMIT 1 """ % (they_are_sorted, tbl)
122 native_data = db.retrieve_data(sql_command)
124 log.info(
"Gathering results. Saving to %s", fn_output)
126 out_db.create(fn_output, overwrite=
True)
127 out_db.connect(fn_output)
128 out_db.create_table(tbl, sorted_names, sorted_types)
134 log.info(
"Reading %s",fn)
137 sql_command =
"""SELECT %s FROM %s
138 WHERE assignment<>"native"
139 ORDER BY %s ASC LIMIT %s """ % (
140 they_are_sorted, tbl,orderby, max_number)
141 data = db.retrieve_data(sql_command)
142 log.info(
"%s records read from %s",len(data), fn)
147 if(len(best_records) < max_number):
148 heapq.heappush(best_records, a)
152 if(best_records[0] < a):
153 heapq.heapreplace(best_records, a)
155 log.error(
"Error for %s: %s",fn, e)
161 ratio = float(n_problems)/float(len(fns))
163 raise IOError(
"There are %8.1f %s of the database "\
164 "files to merge with problems! " % (ratio*100,
"%"))
166 heapq.heappush(best_records, native_data[0])
167 out_db.store_data(tbl, best_records)
172 Reads a set of database files and puts them in a single file
173 Makes sure to reorder all column names if neccesary before merging
174 @param fns List of database files
175 @param fn_output Name of the output database
176 @param raisef See help for gather_best_solution_results()
182 names = db.get_table_column_names(tbl)
183 types = db.get_table_types(tbl)
185 sorted_names = [ names[i]
for i
in indices]
186 sorted_types = [ types[i]
for i
in indices]
187 log.info(
"Gathering results. Saving to %s", fn_output)
189 out_db.create(fn_output, overwrite=
True)
190 out_db.connect(fn_output)
191 out_db.create_table(tbl, sorted_names, sorted_types)
196 log.info(
"Reading %s",fn)
198 names = db.get_table_column_names(tbl)
200 they_are_sorted = field_delim.join(names)
201 log.debug(
"Retrieving %s", they_are_sorted)
202 sql_command =
"SELECT %s FROM %s" % (they_are_sorted, tbl)
203 data = db.retrieve_data(sql_command)
204 out_db.store_data(tbl, data)
207 log.error(
"Error for file %s: %s",fn, e)
209 ratio = float(n_problems)/float(len(fns))
211 raise IOError(
"There are %8.1f %s of the database "\
212 "files to merge with problems! " % (ratio*100,
"%"))
216 """ Return indices that sort the list l """
217 pairs = [(element, i)
for i,element
in enumerate(l)]
219 indices = [p[1]
for p
in pairs]
225 Recover the reference frame of the n-th best solution from a database.
226 The index Nth stars at 0
229 sql_command =
""" SELECT %s FROM %s
231 ASC LIMIT 1 OFFSET %d """ % (f, tbl, orderby, Nth)
232 data = Database.read_data(fn_database, sql_command)
234 raise ValueError(
"The requested %s-th best solution does not exist. "\
235 "Only %s solutions found" % (Nth, len(data) ))
239 def get_pca(string, delimiter="/"):
240 pca = string.split(delimiter)
241 pca = [float(p)
for p
in pca]
246 Get a list of fields and return a string with them. If there are no
247 fields, return an *, indicating SQL that all the fields are requested
248 @param fields A list of strings
253 return field_delim.join(fields)
259 Class for managing the results of the experiments
261 def __init__(self, ):
263 self.native_table_name =
"native"
264 self.results_table =
"results"
265 self.placements_table =
"placements"
266 self.ccc_table_name =
"ccc"
267 self.cluster_records = []
270 self.results_description_columns = [
"solution_id",
"assignment",
272 self.results_description_types = [int, str, str]
274 self.results_measures_columns = [
"drms",
"cdrms",
"crmsd"]
275 self.results_measures_types = [float, float, float]
279 Build the table of results
280 @param restraints_names The names given to the columns of the table
281 @param add_measures If True, add fields for comparing models
282 and native conformation
284 table_fields = self.results_description_columns + \
285 [
"total_score"] + restraints_names
286 table_types = self.results_description_types + \
287 [float] + [float
for r
in restraints_names]
290 table_fields += self.results_measures_columns
291 table_types += self.results_measures_types
292 log.debug(
"Creating table %s\n%s",table_fields,table_types)
293 self.
create_table(self.results_table, table_fields, table_types)
296 self.
create_table(self.native_table_name, table_fields, table_types)
299 max_number=
None, orderby=
False):
302 @param fields Fields to recover from the table
303 @param max_number Maximum number of solutions to recover
304 @param orderby Name of the restraint used for sorting the states
307 log.info(
"Getting %s from solutions", fields)
308 f = self.get_fields_string(fields)
309 sql_command =
"SELECT %s FROM %s " % (f, self.results_table)
311 sql_command +=
" ORDER BY %s ASC" % orderby
312 if max_number
not in (
None,
False):
313 sql_command +=
" LIMIT %d" % (max_number)
314 log.debug(
"Using %s", sql_command )
320 Get solutions from the database.
321 @param fields Fields requested. If the fields are in different
322 tables, a left join is done. Otherwise get_solutions_results_table()
323 is called. See get_solutions_results_table() for the meaning
328 tables = self.get_tables_names()
329 log.debug(
"tables %s", tables)
330 required_tables = set()
331 pairs_table_field = []
335 for f,t
in [(f,t)
for f
in fields
for t
in tables]:
336 if t ==
"native" or f ==
"solution_id":
340 required_tables.add(t)
341 pairs_table_field.append((t,f))
342 required_tables = list(required_tables)
343 log.debug(
"required_tables %s", required_tables)
344 log.debug(
"pairs_table_field %s", pairs_table_field)
345 if len(required_tables) == 0:
349 elif len(required_tables) == 1
and required_tables[0] ==
"results":
353 elif len(required_tables) > 1:
357 sql_command +=
" ORDER BY %s ASC" % orderby
358 log.debug(
"Using %s", sql_command )
362 raise ValueError(
"Fields not found in the database")
366 Recover data for the native solution
367 @param fields Fields to recover
370 f = self.get_fields_string(fields)
371 sql_command =
"SELECT %s FROM %s " % (f, self.native_table_name)
375 def add_record(self, solution_id, assignment, RFs, total_score,
376 restraints_scores, measures):
378 Add a recorde to the database
379 @param solution_id The key for the solution
380 @param assignment The assigment for the solution provided by
382 @param RFs Reference frames of the rigid bodies of the components
383 of the assembly in the solution
384 @param total_score Total value of the scoring function
385 @param restraints_scores A list with all the values for the
387 @param measures A list with the values of all the measures for
390 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
391 RFs_txt = unit_delim.join(words)
392 record = [solution_id, assignment, RFs_txt, total_score] + \
395 record = record + measures
396 self.records.append(record)
401 Add a record for the native structure to the database
402 see add_record() for the meaning of the parameters
404 words = [io.ReferenceFrameToText(ref).get_text()
for ref
in RFs]
405 RFs_txt = unit_delim.join(words)
407 record = [solution_id, assignment, RFs_txt, total_score] + \
410 record = record + measures
411 self.
store_data(self.native_table_name, [record])
413 def save_records(self,table="results"):
417 """ both distances and angles are expected to be a list of floats """
418 return [solution_id] + distances + angles
423 Creates a table to store the values of the placement scores for the
425 @param names Names of the components of the assembly
428 self.placement_table_name = self.placements_table
429 table_fields = [
"solution_id"]
430 table_fields += [
"distance_%s" % name
for name
in names]
431 table_fields += [
"angle_%s" % name
for name
in names]
432 table_types = [int] + [float
for f
in table_fields]
434 self.
create_table(self.placement_table_name, table_fields, table_types)
436 table_fields, table_types,check=
True)
438 native_values = [0
for t
in table_fields]
439 log.debug(
"%s", self.native_table_name)
440 log.debug(
"table fields %s", table_fields)
442 table_fields, native_values,
443 [
"assignment"], [
"\"native\""])
447 Return the names of the placement score fields in the database
450 fields = [col
for col
in columns
if "distance" in col
or "angle" in col]
455 Add a table to the database for store the values of the cross
456 correlation coefficient between a model and the native configuration
460 table_fields = [
"solution_id",
"ccc"]
461 table_types = [int, float]
463 self.
create_table(self.ccc_table_name, table_fields, table_types)
466 table_fields, table_types,check=
True)
468 table_fields, [0,1.00], [
"assignment"], [
"\"native\""])
471 """ Format for the record to store in the ccc table """
472 return [solution_id, ccc]
476 Recover the cross-correlation coefficient for a solution
479 sql_command =
""" SELECT ccc FROM %s
480 WHERE solution_id=%d """ % (self.ccc_table_name,
485 def store_ccc_data(self, ccc_data):
486 self.
store_data(self.ccc_table_name, ccc_data)
488 def store_placement_data(self, data):
489 log.debug(
"store placement table %s",data)
490 self.
store_data(self.placement_table_name,data)
494 Format a left join SQL command that recovers all fileds from the
496 @param pairs_table_field Pairs of (table,field)
497 @param tables_names Names of the tables
499 E.g. If pairs_table_filed = ((table1,a), (table2,b), (table3,c),
500 (table2,d)) and tables_names = (table1, table2, table3)
503 SELECT table1.a, table2.b, table3.c, table2.d FROM table1
504 LEFT JOIN table2 ON table1.solution_id = table2.solution_id
505 LEFT JOIN table3 ON table1.solution_id = table3.solution_id
506 WHERE table1.solution_id IS NOT NULL AND
507 table2.solution_id IS NOT NULL AND
508 table3.solution_id IS NOT NULL
511 txt = [
"%s.%s" % (p[0],p[1])
for p
in pairs_table_field]
512 fields_requested = field_delim.join(txt)
513 sql_command =
" SELECT %s FROM %s " % (fields_requested,tables_names[0])
514 n_tables = len(tables_names)
515 for i
in range(1, n_tables):
516 a = tables_names[i-1]
518 sql_command +=
" LEFT JOIN %s " \
519 "ON %s.solution_id = %s.solution_id " % (b,a,b)
522 for i
in range(n_tables-1):
523 sql_command +=
"WHERE %s.solution_id " \
524 "IS NOT NULL AND " % tables_names[i]
525 sql_command +=
" %s.solution_id IS NOT NULL " % tables_names[n_tables-1]
526 log.debug(
"%s" %sql_command)
531 Add a table to store information about the clusters of structures
532 @param name Name of the table
534 self.cluster_table_name = name
536 table_fields = (
"cluster_id",
"n_elements",
537 "representative",
"elements",
"solutions_ids")
538 table_types = (int, int, int, str, str)
543 elements, solutions_ids):
545 Add a record to the cluster database. Actually, only stores it
546 in a list (that will be added later)
547 @param cluster_id Number with the id of the cluster
548 @param n_elements Number of elements in the cluster
549 @param representative Number with the id of the representative
551 @param elements List with the number of the elements of the cluster
552 @param solutions_ids The numbers above are provided by the
553 clustering algorithm. The solutions_ids are the ids of the models
557 record = (cluster_id, n_elements, representative, elements,
559 log.debug(
"Adding cluster record: %s", record)
560 self.cluster_records.append(record)
564 Store the data for the clusters
566 log.info(
"Storing data of clusters. Number of records %s",
567 len(self.cluster_records) )
568 self.
store_data(self.cluster_table_name, self.cluster_records)
572 Recover solutions for a specific list of results
573 @param fields Fields to recover fro the database
574 @param solutions_ids A list with the desired solutions. E.g. [0,3,6]
576 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
577 f = self.get_fields_string(fields)
578 str_ids =
",".join(map(str,solutions_ids))
579 data = self.
retrieve_data( sql_command % (f, self.results_table, str_ids ) )
584 Get the position of the native configuration
585 @param orderby Criterium used to sort the solutions
590 native_value = data[0][0]
593 values = [row[0]
for row
in data]
594 rank = np.searchsorted(values,native_value)
599 Recover the the information about the n-th largest cluster
600 @param position Cluster position (by size) requested
601 (1 is the largest cluster)
602 @param table_name Table where the information about the
605 s =
""" SELECT * FROM %s ORDER BY n_elements DESC """ % table_name
613 Recovers from the database the placement scores for a set of
614 solutions, and returns the mean and standard deviation of the
615 placement score for each of the components of the complex being
616 scored. This function will be typical used to compute the variation
617 of the placement of each component within a cluster of solutions
618 @param solutions_ids The ids of the solutions used to compute
620 @return The output are 4 numpy vectors:
621 placement_distances_mean - The mean placement distance for each
623 placement_distances_stddev - The standardd deviation of the
624 placement distance for each component
625 placement_angles_mean - The mean placement angle for each
627 placement_angles_stddev - The standard deviation of the placement
628 angle for each component,
632 table = self.placements_table
634 distance_fields = filter(
lambda x:
'distance' in x, fields)
635 angle_fields = filter(
lambda x:
'angle' in x, fields)
636 sql_command =
""" SELECT %s FROM %s WHERE solution_id IN (%s) """
638 str_ids =
",".join(map(str,solutions_ids))
639 log.debug(
"Solutions considered %s", solutions_ids)
640 s = sql_command % (
",".join(distance_fields), table, str_ids )
642 s = sql_command % (
",".join(angle_fields), table, str_ids )
644 D = np.array(data_distances)
645 placement_distances_mean = D.mean(axis=0)
646 placement_distances_stddev = D.std(axis=0)
647 A = np.array(data_angles)
648 placement_angles_mean = A.mean(axis=0)
649 placement_angles_stddev = A.std(axis=0)
650 return [placement_distances_mean,placement_distances_stddev,
651 placement_angles_mean, placement_angles_stddev]
656 Calculate the placement score and its standard deviation for
657 the complexes in a set of solutions. The values returned are
658 averages, as the placement score for a complex is the average
659 of the placement scores of the components. This function is used
660 to obtain global placement for a cluster of solutions.
661 @param solutions_ids The ids of the solutions used to compute
663 @return The output are 4 values:
664 plcd_mean - Average of the placement distance for the entire
665 complex over all the solutions.
666 plcd_std - Standard deviation of the placement distance for
667 the entire complex over all the solutions.
668 plca_mean - Average of the placement angle for the entire
669 complex over all the solutions.
670 plca_std - Standard deviation of the placement angle for
671 the entire complex over all the solutions.
673 [placement_distances_mean,placement_distances_stddev,
674 placement_angles_mean, placement_angles_stddev] = \
676 plcd_mean = placement_distances_mean.mean(axis=0)
677 plcd_std = placement_distances_stddev.mean(axis=0)
678 plca_mean = placement_angles_mean.mean(axis=0)
679 plca_std = placement_angles_stddev.mean(axis=0)
680 return [plcd_mean, plcd_std, plca_mean, plca_std]
def format_placement_record
both distances and angles are expected to be a list of floats
def add_placement_scores_table
Creates a table to store the values of the placement scores for the models.
def check_if_is_connected
Checks if the class is connected to the database filename.
Utility functions to manage SQL databases with sqlite3.
def get_solutions
Get solutions from the database.
def get_solutions_results_table
Recovers solutions.
def get_best_solution
Recover the reference frame of the n-th best solution from a database.
def store_data
Inserts information in a given table of the database.
def get_sorting_indices
Return indices that sort the list l.
def retrieve_data
Retrieves data from the database using the sql_command returns the records as a list of tuples...
def get_table_column_names
Get the names of the columns for a given table.
def update_data
updates the register in the table identified by the condition values for the condition fields ...
def format_ccc_record
Format for the record to store in the ccc table.
def add_ccc_table
Add a table to the database for store the values of the cross correlation coefficient between a model...
def get_placement_statistics
Calculate the placement score and its standard deviation for the complexes in a set of solutions...
Simple named tuple class.
def add_results_table
Build the table of results.
def get_nth_largest_cluster
Recover the the information about the n-th largest cluster.
def add_columns
Add columns to the database.
def drop_table
Delete a table if it exists.
def get_solutions_from_list
Recover solutions for a specific list of results.
Class to manage a SQL database built with sqlite3.
def add_native_record
Add a record for the native structure to the database see add_record() for the meaning of the paramet...
def add_record
Add a recorde to the database.
def store_cluster_data
Store the data for the clusters.
def gather_solution_results
Reads a set of database files and puts them in a single file Makes sure to reorder all column names i...
Class for managing the results of the experiments.
def get_native_rank
Get the position of the native configuration.
def create_table
Creates a table.
def get_individual_placement_statistics
Recovers from the database the placement scores for a set of solutions, and returns the mean and stan...
def get_fields_string
Get a list of fields and return a string with them.
The heapq algorithm is a min-heap.
def get_native_solution
Recover data for the native solution.
def get_ccc
Recover the cross-correlation coefficient for a solution.
def get_left_join_command
Format a left join SQL command that recovers all fileds from the tables given.
def get_placement_fields
Return the names of the placement score fields in the database.
def add_clusters_table
Add a table to store information about the clusters of structures.
def add_cluster_record
Add a record to the cluster database.
def gather_best_solution_results
Reads a set of database files and merge them into a single file.