FIX: Add explicit support and tests for subsystems (#118)

If a user wants to fit a subsystem of their database, this code and tests ensures that everything still works. In summary: * Activity data works as normal. For activity data, the phases come from the dataset. All active phases are filtered through pycalphad's `filter_phases` function, to ensue that no phases that cannot be active with the components subset are active. * Non-equilibrium thermochemical data: extra internal degrees of freedom are padded as zeros when constructing points.
HUISUN24 · Mar 3, 2020 · 3f632b3 · 3f632b3
1 parent 6cdc983
commit 3f632b3
Show file tree

Hide file tree

Showing 7 changed files with 365 additions and 7 deletions.
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -112,6 +112,7 @@ These are created from arrays via ``numpy.save()`` and can thus be loaded with `
 Note that the arrays are preallocated with zeros.
 These filenames and settings can be changed using in the input file.
 You can then use these chains and corresponding log-probabilities to make corner plots, calculate autocorrelations, find optimal parameters for databases, etc..
+Some examples are shown in the :ref:`Recipes` page.
 Finally, you can use py:mod:`espei.plot` functions such as ``multiplot`` to plot phase diagrams with your input equilibria data and ``plot_parameters`` to compare single-phase data (e.g. formation and mixing data) with the properties calculated with your database.
 
 Q: Can I run ESPEI on a supercomputer supporting MPI?
@@ -133,7 +134,6 @@ The total log probability is the sum of all log probabilities.
 
 Note that any probability density function always returns a positive value between 0 and 1, so the log probability density function should return negative numbers and the log probability reported by ESPEI should be negative.
 
-:ref:`Writing input files`
 
 Q: Why is the version of ESPEI '0+unknown'?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -142,6 +142,20 @@ A: A version number of ``'0+unknown'`` indicates that you do not have git instal
 This can occur on Windows where git is not in the PATH (and the Python interpreter cannot see it).
 You can install git using ``conda install git`` on Windows.
 
+
+Q: I have a large database, can I use ESPEI to optimize parameters in only a subsystem?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A: Yes, if you have a multicomponent CALPHAD database, but want to optimize or
+determine the uncertainty for a constituent unary, binary or ternary subsystem
+that you have data for, you can do that without any extra effort.
+
+You may be interested in the :ref:`input_mcmc_symbols` input parameter to
+specify which parameter subset to optimize.
+
+Note that if you optimize parameters in a subsystem (e.g. Cu-Mg) that is used in a higher order description (e.g. Al-Cu-Mg), you may need to reoptimize the parameters for the higher order system as well.
+
+
 References
 ==========
 

diff --git a/docs/recipes.rst b/docs/recipes.rst
@@ -2,6 +2,8 @@
 
    \chapter{Recipes}
 
+.. _Recipes:
+
 =======
 Recipes
 =======

diff --git a/docs/writing_input.rst b/docs/writing_input.rst
@@ -453,6 +453,8 @@ used to modify the initial standard deviation of each data type by
     :alt: Data weight equation
     :scale: 100%
 
+.. _input_mcmc_symbols:
+
 symbols
 -------
 

diff --git a/espei/error_functions/activity_error.py b/espei/error_functions/activity_error.py
@@ -9,6 +9,7 @@
 
 from pycalphad import equilibrium, variables as v
 from pycalphad.plot.eqplot import _map_coord_to_variable
+from pycalphad.core.utils import filter_phases
 from scipy.stats import norm
 
 from espei.core_utils import ravel_conditions
@@ -121,8 +122,13 @@ def calculate_activity_error(dbf, comps, phases, datasets, parameters=None, phas
         acr_component = ds['output'].split('_')[1]  # the component of interest
         # calculate the reference state equilibrium
         ref = ds['reference_state']
+        # data_comps and data_phases ensures that we only do calculations on
+        # the subsystem of the system defining the data.
+        data_comps = ds['components']
+        species = list(map(v.Species, data_comps))
+        data_phases = filter_phases(dbf, species, candidate_phases=phases)
         ref_conditions = {_map_coord_to_variable(coord): val for coord, val in ref['conditions'].items()}
-        ref_result = equilibrium(dbf, ds['components'], ref['phases'], ref_conditions,
+        ref_result = equilibrium(dbf, data_comps, ref['phases'], ref_conditions,
                                  model=phase_models, parameters=parameters,
                                  callables=callables)
 
@@ -146,7 +152,7 @@ def calculate_activity_error(dbf, comps, phases, datasets, parameters=None, phas
         conditions_list = [{c: conditions[c][i] for c in conditions.keys()} for i in range(len(conditions[v.T]))]
         current_chempots = []
         for conds in conditions_list:
-            sample_eq_res = equilibrium(dbf, ds['components'], phases, conds,
+            sample_eq_res = equilibrium(dbf, data_comps, data_phases, conds,
                                         model=phase_models, parameters=parameters,
                                         callables=callables)
             current_chempots.append(sample_eq_res.MU.sel(component=acr_component).values.flatten()[0])

diff --git a/espei/error_functions/zpf_error.py b/espei/error_functions/zpf_error.py
@@ -24,7 +24,7 @@
 
 from pycalphad import Database, Model, variables as v
 from pycalphad.codegen.callables import build_phase_records
-from pycalphad.core.utils import instantiate_models
+from pycalphad.core.utils import instantiate_models, filter_phases
 from pycalphad.core.phase_rec import PhaseRecord
 from espei.utils import PickleableTinyDB
 from espei.shadow_functions import equilibrium_, calculate_, no_op_equilibrium_
@@ -119,7 +119,8 @@ def get_zpf_data(dbf: Database, comps: Sequence[str], phases: Sequence[str], dat
     for data in desired_data:
         data_comps = list(set(data['components']).union({'VA'}))
         species = sorted(map(v.Species, data_comps), key=str)
-        models = instantiate_models(dbf, species, phases, parameters=parameters)
+        data_phases = filter_phases(dbf, species, candidate_phases=phases)
+        models = instantiate_models(dbf, species, data_phases, parameters=parameters)
         all_regions = data['values']
         conditions = data['conditions']
         phase_regions = []
@@ -135,9 +136,9 @@ def get_zpf_data(dbf: Database, comps: Sequence[str], phases: Sequence[str], dat
             region_potential_conds[v.N] = region_potential_conds.get(v.N) or 1.0  # Add v.N condition, if missing
             # Extract all the phases and compositions from the tie-line points
             region_phases, region_comp_conds, phase_flags = extract_phases_comps(phase_region)
-            region_phase_records = [build_phase_records(dbf, species, phases, {**region_potential_conds, **comp_conds}, models, parameters=parameters, build_gradients=True, build_hessians=True)
+            region_phase_records = [build_phase_records(dbf, species, data_phases, {**region_potential_conds, **comp_conds}, models, parameters=parameters, build_gradients=True, build_hessians=True)
                                     for comp_conds in region_comp_conds]
-            phase_regions.append(PhaseRegion(region_phases, region_potential_conds, region_comp_conds, phase_flags, dbf, species, phases, models, region_phase_records))
+            phase_regions.append(PhaseRegion(region_phases, region_potential_conds, region_comp_conds, phase_flags, dbf, species, data_phases, models, region_phase_records))
 
         data_dict = {
             'weight': data.get('weight', 1.0),

diff --git a/tests/test_error_functions.py b/tests/test_error_functions.py
@@ -25,6 +25,27 @@ def test_activity_error(datasets_db):
     assert np.isclose(error, -257.41020886970756, rtol=1e-6)
 
 
+def test_subsystem_activity_probability(datasets_db):
+    """Test binary Cr-Ni data produces the same probability regardless of whether the main system is a binary or ternary."""
+
+    datasets_db.insert(CR_NI_ACTIVITY)
+
+    dbf_bin = Database(CR_NI_TDB)
+    dbf_tern = Database(CR_FE_NI_TDB)
+    phases = list(dbf_tern.phases.keys())
+
+    # Truth
+    bin_prob = calculate_activity_error(dbf_bin, ['CR','NI','VA'], phases, datasets_db, {}, {}, {})
+
+    # Getting binary subsystem data explictly (from binary input)
+    prob = calculate_activity_error(dbf_tern, ['CR','NI','VA'], phases, datasets_db, {}, {}, {})
+    assert np.isclose(prob, bin_prob)
+
+    # Getting binary subsystem from ternary input
+    prob = calculate_activity_error(dbf_tern, ['CR', 'FE', 'NI', 'VA'], phases, datasets_db, {}, {}, {})
+    assert np.isclose(prob, bin_prob)
+
+
 def test_non_equilibrium_thermochemical_error_with_multiple_X_points(datasets_db):
     """Multiple composition datapoints in a dataset for a mixing phase should be successful."""
     datasets_db.insert(CU_MG_CPM_MIX_X_HCP_A3)
@@ -164,6 +185,30 @@ def test_non_equilibrium_thermochemical_error_for_of_enthalpy_mixing(datasets_db
     assert np.isclose(error, zero_error_prob, atol=1e-6)
 
 
+def test_subsystem_non_equilibrium_thermochemcial_probability(datasets_db):
+    """Test binary Cr-Ni data produces the same probability regardless of whether the main system is a binary or ternary."""
+
+    datasets_db.insert(CR_NI_LIQUID_DATA)
+
+    dbf_bin = Database(CR_NI_TDB)
+    dbf_tern = Database(CR_FE_NI_TDB)
+    phases = list(dbf_tern.phases.keys())
+
+    # Truth
+    thermochemical_data = get_thermochemical_data(dbf_bin, ['CR', 'NI', 'VA'], phases, datasets_db)
+    bin_prob = calculate_non_equilibrium_thermochemical_probability(dbf_bin, thermochemical_data)
+
+    # Getting binary subsystem data explictly (from binary input)
+    thermochemical_data = get_thermochemical_data(dbf_tern, ['CR', 'NI', 'VA'], phases, datasets_db)
+    prob = calculate_non_equilibrium_thermochemical_probability(dbf_tern, thermochemical_data)
+    assert np.isclose(prob, bin_prob)
+
+    # Getting binary subsystem from ternary input
+    thermochemical_data = get_thermochemical_data(dbf_tern, ['CR', 'FE', 'NI', 'VA'], phases, datasets_db)
+    prob = calculate_non_equilibrium_thermochemical_probability(dbf_tern, thermochemical_data)
+    assert np.isclose(prob, bin_prob)
+
+
 def test_zpf_error_zero(datasets_db):
     """Test that sum of square ZPF errors returns 0 for an exactly correct result"""
     datasets_db.insert(CU_MG_DATASET_ZPF_ZERO_ERROR)
@@ -178,3 +223,27 @@ def test_zpf_error_zero(datasets_db):
     zpf_data = get_zpf_data(dbf, comps, phases, datasets_db, {})
     error = calculate_zpf_error(zpf_data, np.array([]))
     assert np.isclose(error, zero_error_prob, rtol=1e-6)
+
+
+def test_subsystem_zpf_probability(datasets_db):
+    """Test binary Cr-Ni data produces the same probability regardless of whether the main system is a binary or ternary."""
+
+    datasets_db.insert(CR_NI_ZPF_DATA)
+
+    dbf_bin = Database(CR_NI_TDB)
+    dbf_tern = Database(CR_FE_NI_TDB)
+    phases = list(dbf_tern.phases.keys())
+
+    # Truth
+    zpf_data = get_zpf_data(dbf_bin, ['CR', 'NI', 'VA'], phases, datasets_db, {})
+    bin_prob = calculate_zpf_error(zpf_data, np.array([]))
+
+    # Getting binary subsystem data explictly (from binary input)
+    zpf_data = get_zpf_data(dbf_tern, ['CR', 'NI', 'VA'], phases, datasets_db, {})
+    prob = calculate_zpf_error(zpf_data, np.array([]))
+    assert np.isclose(prob, bin_prob)
+
+    # Getting binary subsystem from ternary input
+    zpf_data = get_zpf_data(dbf_tern, ['CR', 'FE', 'NI', 'VA'], phases, datasets_db, {})
+    prob = calculate_zpf_error(zpf_data, np.array([]))
+    assert np.isclose(prob, bin_prob)
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,8 @@ @@
        \chapter{Recipes}
+    .. _Recipes:
     =======
     Recipes
     =======
@@ Expand Down @@