IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
normalize_map_for_parts_fitting.py
1 # Script to perform normalization on an EM map
2 # by histogram matching to a reference
3 
4 import IMP
5 import IMP.em
6 import numpy as np
7 from statsmodels.distributions.empirical_distribution import ECDF
8 from statsmodels.distributions.empirical_distribution import StepFunction
9 import argparse
10 import os
11 from . import config
12 from . import tools
13 
14 
15 def compute_cdf(dmap, exclude_zero=True):
16  # Returns an empirical function for the CDF of voxel intensities
17  # If exclude_zero = True, then ignore zeros (as they are cropped pixels)
18  vals = []
19  for i in range(dmap.get_number_of_voxels()):
20  val = dmap.get_value(i)
21  if not exclude_zero or val != 0:
22  vals.append(val)
23  if len(vals) == 0:
24  raise ValueError("The input density map contains no valid "
25  "voxel values.")
26  f = ECDF(vals) # Return the Empirical CDF of an array as a step function
27  return f
28 
29 
30 def compute_inverse_cdf(dmap, rn=(0, 1), exclude_zero=True):
31  # Returns a function that maps a CDF to the intensity value
32  # f = compute_cdf(exclude_zero=exclude_zero)
33  f = compute_cdf(dmap, exclude_zero=exclude_zero)
34  x = np.arange(rn[0], rn[1], 0.0001)
35  y = f(x)
36  fi = StepFunction(y, x)
37  return fi
38 
39 
40 def compare_stats_hist(ref, exp):
41  # Function to debug
42  expcdf = compute_cdf(exp)
43  refinvcdf = compute_inverse_cdf(ref)
44  refcdf = compute_cdf(ref)
45  emax = exp.get_max_value()
46  emin = exp.get_min_value()
47  print("Val ecdf refval refcdf")
48  for i in np.arange(emin, emax, 0.01):
49  cc = expcdf(i)
50  refval = refinvcdf(cc)
51  print(i, cc, refval, refinvcdf(refval))
52 
53  print("Val ecdf refval refcdf")
54  for i in np.arange(0, 1, 0.01):
55  print(i, refcdf(i), refinvcdf(i))
56 
57 
58 def convert_map_to_ndarray(dmap):
59  vals = []
60  for i in range(dmap.get_number_of_voxels()):
61  val = dmap.get_value(i)
62  vals.append(val)
63  map_array = np.array(vals)
64  return map_array
65 
66 
67 def hist_match(source_map, template_map):
68  """
69  Adjust the pixel values of a Map such that its histogram
70  matches that of a target Map
71 
72  Arguments:
73  -----------
74  source: IMP.map object
75  Map to transform; the histogram is computed over the flattened
76  array
77  template: IMP.map object
78  Template map; can have different dimensions to source
79  """
80 
81  source = convert_map_to_ndarray(source_map)
82  template = convert_map_to_ndarray(template_map)
83 
84  # get the set of unique pixel values and their corresponding indices and
85  # counts
86  s_values, bin_idx, s_counts = np.unique(source, return_inverse=True,
87  return_counts=True)
88  t_values, t_counts = np.unique(template, return_counts=True)
89 
90  # take the cumsum of the counts and normalize by the number of pixels to
91  # get the empirical cumulative distribution functions for the source and
92  # template maps (maps pixel value --> quantile)
93  s_quantiles = np.cumsum(s_counts).astype(np.float64)
94  s_quantiles /= s_quantiles[-1]
95  t_quantiles = np.cumsum(t_counts).astype(np.float64)
96  t_quantiles /= t_quantiles[-1]
97 
98  of = open("hmatch_new.dat", "w")
99 
100  for i in range(source_map.get_number_of_voxels()):
101  expval = source_map.get_value(i)
102  if expval != 0:
103  id_ = np.where(s_values == expval)
104 # print(s_quantiles[id_])
105  # interpolate linearly to find the pixel values in the template map
106  # that correspond most closely to the quantiles in the source map
107  interp_t_value = np.interp(s_quantiles[id_], t_quantiles, t_values)
108  of.write(str(i)+" "+str(expval)+" "+str(interp_t_value[0])+"\n")
109  source_map.set_value(i, interp_t_value)
110 
111 
112 def histogram_match_map(ref, exp, em_map, df=None):
113  '''
114  Match the histogram of exp to ref
115  '''
116  # First, compute CDF for both ref and exp:
117  expcdf = compute_cdf(exp) # compute the CDF for experimental map
118  # compute a mapping function that maps between the frequencies to
119  # pixel values for the ref map
120  refinvcdf = compute_inverse_cdf(ref)
121  refcdf = compute_cdf(ref) # compute the CDF for the ref map
122 
123  if df is not None:
124  of = open(os.path.join(os.path.dirname(em_map), "hmatch.dat"), "w")
125  for i in range(exp.get_number_of_voxels()):
126  expval = exp.get_value(i)
127  # get how frequent that pixel value (expval) occurs in
128  # the experimental map
129  expc = expcdf(expval)
130  # get what pixel value occurs with same frequency in the ref map
131  refval = refinvcdf(expc)
132  if expval != 0:
133  if df is not None:
134  of.write(str(i)+" "+str(expval)+" "+str(expc)+" "
135  + str(refval)+" "+str(refcdf(refval))+"\n")
136  exp.set_value(i, refval)
137 
138 
139 def get_percentile_of_value(dmap, value, thresh=-100000):
140 
141  # For the given value, compute its percentile over the values
142  # in the density map
143 
144  dmap_vals = []
145  for i in range(dmap.get_number_of_voxels()):
146  val = dmap.get_value(i)
147  if val > thresh:
148  dmap_vals.append(val)
149  pct = (np.array(dmap_vals) < value).sum() / len(dmap_vals)
150  return pct
151 
152 
153 parser = argparse.ArgumentParser(
154  description='Given an EMDB id, normalize the density map using '
155  'histogram matching')
156 parser.add_argument('emdb', metavar='emdb', type=str,
157  help='Provide an EMDB id')
158 parser.add_argument('--thresh', metavar='thresh', type=str,
159  help='map threshold value')
160 parser.add_argument(
161  "--database_home", dest="database_home", type=str,
162  help="Directory containing all data files used in the protocol",
163  default=".")
164 parser.add_argument(
165  "--reference_map", dest="reference_map", type=str,
166  help="reference map for voxel data extraction",
167  default="reference/ref.mrc")
168 
169 args = parser.parse_args()
170 curr_dir = os.getcwd()
171 emdb = args.emdb
172 
173 # Filepaths
174 em_map = os.path.join(args.database_home, str(emdb), "0system", "emdb.map")
175 norm_em_map = os.path.join(args.database_home, str(emdb), "0system",
176  "emdb_normalized_new.map")
177 pdb_file = os.path.join(args.database_home, str(emdb), "0system",
178  "native.pdb")
179 data_file = os.path.join(args.database_home, str(emdb), "0system",
180  "normalization_new.dat")
181 
182 # If there's no EM map, then there's nothing to do!
183 if not os.path.exists(em_map):
184  print("NO em map; check path", em_map)
185  exit()
186 
187 df = open(data_file, "w")
188 df.write("voxel exp_val exp_cdf ref_val ref_cdf")
189 
190 dmap = IMP.em.read_map(em_map, IMP.em.MRCReaderWriter())
191 ref_dmap = IMP.em.read_map(args.reference_map, IMP.em.MRCReaderWriter())
192 
193 # Get the coordinates of the PDB Calpha atoms
194 m = IMP.Model()
196 particles = IMP.atom.Selection(rh).get_selected_particles()
197 
198 # Crop map coordinates greater than 14 angstroms from all CA atoms
199 dmap_c = dmap.get_cropped(particles, 14.0)
200 
201 try:
202  xml = config.get_xml(emdb)
203  thresh = tools.get_threshold_from_xml(xml)
204 except: # noqa: E722
205  mass = IMP.atom.get_mass_from_number_of_residues(len(particles))
206  thresh = float(args.thresh)
207  print("thresh value {} for the emdb {}".format(str(thresh), emdb))
208 
209 # Then, threshold the map at the user-defined value
210 dmap_t = IMP.em.get_threshold_map(dmap_c, thresh)
211 
212 # Match histogram of thresholded map to reference
213 histogram_match_map(ref_dmap, dmap_t, em_map, df=df)
214 
215 # Write out normalized map
216 IMP.em.write_map(dmap_t, norm_em_map, IMP.em.MRCReaderWriter())
double get_mass_from_number_of_residues(unsigned int num_aa)
Estimate the mass of a protein from the number of amino acids.
DensityMap * get_threshold_map(const DensityMap *orig_map, float threshold)
Return a map with 0 for all voxels below the threshold.
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
Basic utilities for handling cryo-electron microscopy 3D density maps.
Select all CA ATOM records.
Definition: pdb.h:142
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70