Clustering dimensions (dials#2743)
Improvements to the clustering procedure used in dials.correlation_matrix:
 - cc_weights=sigma and weights=standard_error are now the defaults for dataset clustering
 - the number of dimensions required for cosine-angle clustering is optimised automatically (also the default behaviour)
 - the dimension optimisation is reported in the log file
 - when calculating the functional, additional outlier rejection is applied in dials.cosym for stability; this is used for cluster analysis only, not for symmetry determination
---------
Co-authored-by: James Beilsten-Edmands <[email protected]>
amyjaynethompson authored Oct 9, 2024
1 parent 1ad373b commit 7df425c
Showing 7 changed files with 110 additions and 52 deletions.
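
With these defaults in place, a plain run of the command now optimises the dimensionality automatically. For illustration (file names are placeholders), this invocation spells out the settings that the PHIL changes in this commit make implicit:

dials.correlation_matrix scaled.expt scaled.refl \
  cc_weights=sigma weights=standard_error dimensions=Auto \
  dimensionality_assessment.outlier_rejection=True \
  dimensionality_assessment.maximum_dimensions=50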
1 change: 1 addition & 0 deletions newsfragments/2743.feature
@@ -0,0 +1 @@
``dials.correlation_matrix``: Add dimension optimisation for intensity-based dataset clustering
39 changes: 38 additions & 1 deletion src/dials/algorithms/correlation/analysis.py
@@ -12,6 +12,7 @@

import iotbx.phil
from dxtbx.model import ExperimentList
from libtbx import Auto
from libtbx.phil import scope_extract
from scitbx.array_family import flex

@@ -46,9 +47,27 @@
min_reflections = 10
.type = int(value_min=1)
.help = "The minimum number of reflections per experiment."
dimensionality_assessment {
outlier_rejection = True
.type = bool
.help = "Use outlier rejection when determining optimal dimensions for analysis."
maximum_dimensions = 50
.type = int
.help = "Maximum number of dimensions to test for reasonable processing time"
}
""",
process_includes=True,
)
phil_overrides = phil_scope.fetch(
source=iotbx.phil.parse(
"""\
cc_weights=sigma
weights=standard_error
"""
)
)
working_phil = phil_scope.fetch(sources=[phil_overrides])


class CorrelationMatrix:
@@ -122,6 +141,12 @@ def __init__(
self.params.lattice_group = self.datasets[0].space_group_info()
self.params.space_group = self.datasets[0].space_group_info()

# If dimensions are optimised for clustering, need cc_weights=sigma
# Otherwise results end up being nonsensical even for high-quality data
# Outlier rejection was also found to be beneficial for optimising clustering dimensionality
if self.params.dimensions is Auto and self.params.cc_weights != "sigma":
raise ValueError("To optimise dimensions, cc_weights=sigma is required.")

self.cosym_analysis = CosymAnalysis(self.datasets, self.params)

def _merge_intensities(self, datasets: list) -> list:
@@ -182,7 +207,19 @@ def calculate_matrices(self):
self.cosym_analysis._intialise_target()

# Cosym procedures to calculate the cos-angle matrix
self.cosym_analysis._determine_dimensions()
if (
len(self.unmerged_datasets)
<= self.params.dimensionality_assessment.maximum_dimensions
):
dims_to_test = len(self.unmerged_datasets)
else:
dims_to_test = self.params.dimensionality_assessment.maximum_dimensions

if self.params.dimensions is Auto:
self.cosym_analysis._determine_dimensions(
dims_to_test,
outlier_rejection=self.params.dimensionality_assessment.outlier_rejection,
)
self.cosym_analysis._optimise(
self.cosym_analysis.params.minimization.engine,
max_iterations=self.cosym_analysis.params.minimization.max_iterations,
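
The phil_overrides/working_phil pattern above is what layers the new cc_weights=sigma and weights=standard_error defaults on top of the shared cosym scope. A minimal sketch of the same fetch/override mechanism, using an invented two-parameter scope standing in for the real one:

import iotbx.phil

# Invented base scope standing in for the shared cosym parameters.
base_scope = iotbx.phil.parse(
    """\
cc_weights = None sigma
  .type = choice
weights = count standard_error
  .type = choice
"""
)

# Fetching with an override source layers new defaults over the base scope.
overrides = base_scope.fetch(
    source=iotbx.phil.parse(
        """\
cc_weights=sigma
weights=standard_error
"""
    )
)
working = base_scope.fetch(sources=[overrides])
params = working.extract()
print(params.cc_weights, params.weights)  # sigma standard_error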
98 changes: 55 additions & 43 deletions src/dials/algorithms/symmetry/cosym/__init__.py
@@ -38,6 +38,9 @@
phil_scope = iotbx.phil.parse(
"""\
seed = 230
.type = int(value_min=0)
normalisation = kernel quasi *ml_iso ml_aniso
.type = choice
@@ -265,65 +268,73 @@ def _intialise_target(self):
nproc=self.params.nproc,
)

def _determine_dimensions(self):
if self.params.dimensions is Auto and self.target.dim == 2:
self.params.dimensions = 2
elif self.params.dimensions is Auto:
logger.info("=" * 80)
logger.info(
"\nAutomatic determination of number of dimensions for analysis"
def _determine_dimensions(self, dims_to_test, outlier_rejection=False):
logger.info("=" * 80)
logger.info("\nAutomatic determination of number of dimensions for analysis")
dimensions = []
functional = []
for dim in range(1, dims_to_test + 1):
logger.debug("Testing dimension: %i", dim)
self.target.set_dimensions(dim)
max_calls = self.params.minimization.max_calls
self._optimise(
self.params.minimization.engine,
max_iterations=self.params.minimization.max_iterations,
max_calls=min(20, max_calls) if max_calls else max_calls,
)
dimensions = []
functional = []
for dim in range(1, self.target.dim + 1):
logger.debug("Testing dimension: %i", dim)
self.target.set_dimensions(dim)
max_calls = self.params.minimization.max_calls
self._optimise(
self.params.minimization.engine,
max_iterations=self.params.minimization.max_iterations,
max_calls=min(20, max_calls) if max_calls else max_calls,

dimensions.append(dim)
functional.append(
self.target.compute_functional_score_for_dimension_assessment(
self.minimizer.x, outlier_rejection
)
dimensions.append(dim)
functional.append(self.minimizer.fun)
)

# Find the elbow point of the curve, in the same manner as that used by
# distl spotfinder for resolution method 1 (Zhang et al 2006).
# See also dials/algorithms/spot_finding/per_image_analysis.py

# Find the elbow point of the curve, in the same manner as that used by
# distl spotfinder for resolution method 1 (Zhang et al 2006).
# See also dials/algorithms/spot_finding/per_image_analysis.py
x = np.array(dimensions)
y = np.array(functional)
slopes = (y[-1] - y[:-1]) / (x[-1] - x[:-1])
p_m = slopes.argmin()

x = np.array(dimensions)
y = np.array(functional)
slopes = (y[-1] - y[:-1]) / (x[-1] - x[:-1])
p_m = slopes.argmin()
x1 = matrix.col((x[p_m], y[p_m]))
x2 = matrix.col((x[-1], y[-1]))

x1 = matrix.col((x[p_m], y[p_m]))
x2 = matrix.col((x[-1], y[-1]))
gaps = []
v = matrix.col(((x2[1] - x1[1]), -(x2[0] - x1[0]))).normalize()

gaps = []
v = matrix.col(((x2[1] - x1[1]), -(x2[0] - x1[0]))).normalize()
for i in range(p_m, len(x)):
x0 = matrix.col((x[i], y[i]))
r = x1 - x0
g = abs(v.dot(r))
gaps.append(g)

for i in range(p_m, len(x)):
x0 = matrix.col((x[i], y[i]))
r = x1 - x0
g = abs(v.dot(r))
gaps.append(g)
p_g = np.array(gaps).argmax()

p_g = np.array(gaps).argmax()
x_g = x[p_g + p_m]

x_g = x[p_g + p_m]
logger.info(
dials.util.tabulate(
zip(dimensions, functional), headers=("Dimensions", "Functional")
)
)

logger.info("Best number of dimensions: %i", x_g)
if int(x_g) < 2:
logger.info(
dials.util.tabulate(
zip(dimensions, functional), headers=("Dimensions", "Functional")
)
"As a minimum of 2-dimensions is required, dimensions have been set to 2."
)
logger.info("Best number of dimensions: %i", x_g)
self.target.set_dimensions(2)
else:
self.target.set_dimensions(int(x_g))
logger.info("Using %i dimensions for analysis", self.target.dim)
logger.info("Using %i dimensions for analysis", self.target.dim)

def run(self):
self._intialise_target()
self._determine_dimensions()
if self.params.dimensions is Auto and self.target.dim != 2:
self._determine_dimensions(self.target.dim)
self._optimise(
self.params.minimization.engine,
max_iterations=self.params.minimization.max_iterations,
@@ -335,6 +346,7 @@ def run(self):

@Subject.notify_event(event="optimised")
def _optimise(self, engine, max_iterations=None, max_calls=None):
np.random.seed(self.params.seed)
NN = len(set(self.dataset_ids))
n_sym_ops = len(self.target.sym_ops)

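
The elbow search in _determine_dimensions above follows the distl spotfinder construction (Zhang et al 2006): first find the candidate point with the steepest slope towards the final point, then take the candidate with the greatest perpendicular distance from the chord joining that point to the final point. A numpy-only sketch of the same geometry (function name and test curve are invented for illustration):

import numpy as np

def elbow_point(dimensions, functional):
    # Pick the elbow of a decreasing functional-vs-dimension curve.
    x = np.asarray(dimensions, dtype=float)
    y = np.asarray(functional, dtype=float)
    # Steepest slope from any candidate point to the final point.
    slopes = (y[-1] - y[:-1]) / (x[-1] - x[:-1])
    p_m = slopes.argmin()
    # Unit vector perpendicular to the chord from point p_m to the end.
    dx, dy = x[-1] - x[p_m], y[-1] - y[p_m]
    v = np.array([dy, -dx]) / np.hypot(dx, dy)
    # Candidate with the largest perpendicular distance is the elbow.
    r = np.stack([x[p_m:] - x[p_m], y[p_m:] - y[p_m]], axis=1)
    return int(x[p_m + np.abs(r @ v).argmax()])

# A curve that flattens after three dimensions gives an elbow at 3.
print(elbow_point([1, 2, 3, 4, 5, 6], [100.0, 40.0, 12.0, 10.5, 10.2, 10.0]))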
14 changes: 14 additions & 0 deletions src/dials/algorithms/symmetry/cosym/target.py
@@ -473,6 +473,20 @@ def compute_functional(self, x: np.ndarray) -> float:
f = 0.5 * elements.sum()
return f

def compute_functional_score_for_dimension_assessment(
self, x: np.ndarray, outlier_rejection: bool = True
) -> float:
if not outlier_rejection:
return self.compute_functional(x)
x = x.reshape((self.dim, x.size // self.dim))
elements = np.square(self.rij_matrix - x.T @ x)
if self.wij_matrix is not None:
np.multiply(self.wij_matrix, elements, out=elements)

q1, q2, q3 = np.quantile(elements, (0.25, 0.5, 0.75))
inliers = elements[elements < q2 + (q3 - q1)]
return 0.5 * inliers.sum()

def compute_gradients_fd(self, x: np.ndarray, eps=1e-6) -> np.ndarray:
"""Compute the gradients at coordinates `x` using finite differences.
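
The quantile cut in compute_functional_score_for_dimension_assessment keeps only elements below the median plus the interquartile range before summing, so a few pathological dataset pairs cannot dominate the functional while the dimensionality is being assessed (the plain functional is still used for symmetry determination). A small synthetic numpy demonstration of the cut (data invented for illustration):

import numpy as np

rng = np.random.default_rng(0)
# Mostly well-behaved weighted residuals around 1.0 ...
elements = rng.normal(1.0, 0.1, size=(20, 20)) ** 2
# ... plus one grossly outlying dataset pair.
elements[0, 1] = elements[1, 0] = 25.0

q1, q2, q3 = np.quantile(elements, (0.25, 0.5, 0.75))
inliers = elements[elements < q2 + (q3 - q1)]  # median + IQR cutoff
print(0.5 * elements.sum())  # inflated by the planted outliers
print(0.5 * inliers.sum())   # stable value fed to the elbow analysis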
5 changes: 1 addition & 4 deletions src/dials/command_line/correlation_matrix.py
@@ -23,10 +23,7 @@

phil_scope = iotbx.phil.parse(
"""\
include scope dials.algorithms.correlation.analysis.phil_scope
seed = 42
.type = int(value_min=0)
include scope dials.algorithms.correlation.analysis.working_phil
output {
log = dials.correlation_matrix.log
3 changes: 0 additions & 3 deletions src/dials/command_line/cosym.py
@@ -75,9 +75,6 @@
.type = int(value_min=1)
.help = "The minimum number of reflections per experiment."
seed = 230
.type = int(value_min=0)
output {
suffix = "_reindexed"
.type = str
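
Moving seed into the shared cosym PHIL scope and calling np.random.seed at the top of _optimise makes repeated runs deterministic. A toy sketch of the effect (function and names invented; the real starting coordinates are generated inside cosym):

import numpy as np

def random_start(n_datasets, dim, seed=230):
    # Seeding immediately before drawing fixes the starting
    # coordinates, and hence the minimisation path.
    np.random.seed(seed)
    return np.random.rand(dim * n_datasets)

assert np.allclose(random_start(5, 3), random_start(5, 3))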
2 changes: 1 addition & 1 deletion tests/algorithms/correlation/test_analysis.py
@@ -75,7 +75,7 @@ def test_filtered_corr_mat(proteinase_k, run_in_tmp_path):
matrices.output_json()
assert pathlib.Path("dials.correlation_matrix.json").is_file()

expected_ids = [[1, 3], [0, 1, 3]]
expected_ids = [[0, 3], [0, 1, 3]]

# Check main algorithm correct with filtering
for i, j in zip(matrices.correlation_clusters, expected_ids):
