Source code for yank.repex

#!/usr/local/bin/env python

# ==============================================================================
# MODULE DOCSTRING
# ==============================================================================

"""
Repex
=====

Replica-exchange simulation algorithms and specific variants.

This module provides a general facility for running replica-exchange simulations,
as well as derived classes for special cases such as parallel tempering (in which
the states differ only in temperature).

Provided classes include:

- :class:`ReplicaExchange`
    Base class for general replica-exchange simulations.
- :class:`ParallelTempering`
    Convenience subclass of ReplicaExchange for parallel tempering simulations
    (one System object, many temperatures).
- :class:`Reporter`
    Replica Exchange reporter class to store all variables and data

COPYRIGHT

Current version by Andrea Rizzi <andrea.rizzi@choderalab.org>, Levi N. Naden <levi.naden@choderalab.org> and
John D. Chodera <john.chodera@choderalab.org> while at Memorial Sloan-Kettering Cancer Center.

Original version by John D. Chodera <jchodera@gmail.com> while at the University of
California Berkeley.

LICENSE

This code is licensed under the latest available version of the MIT License.

"""

# ==============================================================================
# GLOBAL IMPORTS
# ==============================================================================

from simtk import openmm
from simtk import unit

import os
import math
import copy
import time
import uuid
import yaml
import inspect
import logging
import datetime
import collections

import numpy as np
import mdtraj as md
import netCDF4 as netcdf

import openmmtools as mmtools
from openmmtools.states import ThermodynamicState

from . import utils, mpi, version
from pymbar.utils import ParameterError

logger = logging.getLogger(__name__)


# ==============================================================================
# REPLICA EXCHANGE REPORTER
# ==============================================================================

class Reporter(object):
    """Handle storage write/read operations and different format conventions.

    You can use this object to programmatically inspect the data generated
    by ReplicaExchange.

    Parameters
    ----------
    storage : str
        The path to the storage file for analysis. The path of a second,
        checkpoint file is derived either from ``checkpoint_storage`` or
        automatically from this path.
    open_mode : str or None
        The mode of the file between 'r', 'w', and 'a' (or equivalently 'r+').
        If None, the storage file is not opened on construction, and a call
        to :func:`Reporter.open` is needed before any read/write operation.
    checkpoint_interval : int >= 1, Default: 10
        The frequency at which checkpointing information is written relative
        to analysis information. It must be exactly of type ``int``,
        otherwise a ``ValueError`` is raised.
    checkpoint_storage : str or None, optional
        Optional file name of the checkpoint file. This should NOT be a full
        path, only a file name: it is joined to the directory of ``storage``.
        If None, the name is built from the ``storage`` file name by inserting
        ``"_checkpoint"`` between the base name and its extension.
    analysis_particle_indices : tuple of ints, Optional. Default: () (empty tuple)
        Indices of particles which should be treated as special when
        manipulating read and write functions. Any 1-D iterable is accepted
        and stored as a tuple. An empty tuple means no particle is special.

    Attributes
    ----------
    filepath

    """

    def __init__(self, storage, open_mode=None, checkpoint_interval=10, checkpoint_storage=None,
                 analysis_particle_indices=()):
        # Only an exact ``int`` is accepted for the checkpoint interval.
        if type(checkpoint_interval) is not int:
            raise ValueError("checkpoint_interval must be an integer!")

        # The checkpoint file always lives next to the analysis file.
        storage_dir, storage_name = os.path.split(storage)
        if checkpoint_storage is not None:
            checkpoint_storage = os.path.join(storage_dir, checkpoint_storage)
        else:
            stem, extension = os.path.splitext(storage_name)
            checkpoint_storage = os.path.join(storage_dir, stem + "_checkpoint" + extension)
            logger.debug("Initial checkpoint file automatically chosen as {}".format(checkpoint_storage))

        # File paths.
        self._storage_file_analysis = storage
        self._storage_file_checkpoint = checkpoint_storage
        # Open file handles (None until open() is called).
        self._storage_analysis = None
        self._storage_checkpoint = None
        self._checkpoint_interval = checkpoint_interval
        # Normalize whatever 1-D iterable was given to a tuple.
        self._analysis_particle_indices = tuple(analysis_particle_indices)

        if open_mode is not None:
            self.open(open_mode)

    @property
    def filepath(self):
        """Path of the primary (analysis) storage file, as given on construction.

        Exposed so that code outside the Reporter can mention the file in
        error messages and the like.
        """
        return self._storage_file_analysis

    @property
    def _storage(self):
        """Tuple of the storage handles: (analysis, checkpoint).

        Element 0 is always the primary file, all others are subfiles.
        """
        return self._storage_analysis, self._storage_checkpoint

    @property
    def _storage_paths(self):
        """Tuple of the storage file paths: (analysis, checkpoint).

        Element 0 is always the primary file, all others are subfiles.
        """
        return self._storage_file_analysis, self._storage_file_checkpoint

    @property
    def _storage_dict(self):
        """Dictionary mapping 'checkpoint' and 'analysis' to their storage handles."""
        return {'checkpoint': self._storage_checkpoint, 'analysis': self._storage_analysis}

    @property
    def analysis_particle_indices(self):
        """Tuple of the indices of the particles with extra information stored for analysis."""
        return self._analysis_particle_indices
def storage_exists(self, skip_size=False):
    """Check if the storage files exist on disk.

    The check covers every path in ``self._storage_paths``, which hides the
    existence of the subfiles from everything outside the reporter.

    Parameters
    ----------
    skip_size : bool, Optional, Default: False
        Skip the check of the file size. Helpful if you have just
        initialized a storage file but written nothing to it yet and/or it
        is still entirely in memory (e.g. just opened NetCDF files).

    Returns
    -------
    files_exist : bool
        True if the primary storage file and all its subfiles exist (and,
        unless ``skip_size`` is set, are not empty). False otherwise.

    """
    for path in self._storage_paths:
        # A single missing file is enough to fail.
        if not os.path.exists(path):
            return False
        # So is a zero-size file, unless the caller asked to ignore sizes.
        if not skip_size and os.path.getsize(path) <= 0:
            return False
    return True
def is_open(self):
    """Return True if the Reporter is ready to read/write.

    Only the primary storage (element 0 of ``self._storage``) is inspected.
    """
    primary = self._storage[0]
    return False if primary is None else primary.isopen()
def _are_subfiles_open(self):
    """Internal function to check if all subfiles are open.

    Subfiles are every entry of ``self._storage`` after the first one.
    Returns False as soon as a subfile has no handle at all.
    """
    subfiles = self._storage[1:]
    if any(subfile is None for subfile in subfiles):
        return False
    return np.all([subfile.isopen() for subfile in subfiles])
def open(self, mode='r'):
    """Open the storage files for reading/writing.

    Creates and pre-formats the required files if they don't exist.
    Calling this is not necessary if an ``open_mode`` was given to the
    constructor. Any storage that is already open is closed first.

    Parameters
    ----------
    mode : str, Optional, Default: 'r'
        The mode of the file between 'r', 'w', and 'a' (or equivalently 'r+').

    Raises
    ------
    AttributeError
        If a checkpoint file was found but the UUID attributes linking it
        to the analysis file could not be read.
    IOError
        If the UUID of the checkpoint file does not match the one of the
        analysis file.

    """
    # Ensure we don't have already another file
    # open (possibly in a different mode).
    if any(storage is not None for storage in self._storage):
        self.close()

    # Create directory if we want to write.
    if mode != 'r':
        for storage_path in self._storage_paths:
            storage_dir = os.path.dirname(storage_path)
            # An empty dirname means "current directory": os.path.exists('')
            # returns False, so it must be excluded explicitly.
            if storage_dir != '' and not os.path.exists(storage_dir):
                os.makedirs(storage_dir)

    # Open NetCDF 4 files.
    # If no file exists, this appropriately throws an error on 'r' mode.
    primary_ncfiles = {}
    sub_ncfiles = {}
    # TODO: Figure out how to move around the analysis file without the checkpoint file
    # The on-disk format is selected by the ``format`` keyword of Dataset.
    primary_ncfiles['analysis'] = netcdf.Dataset(self._storage_file_analysis, mode, format='NETCDF4')
    if mode == 'w':
        sub_ncfiles['checkpoint'] = netcdf.Dataset(self._storage_file_checkpoint, mode, format='NETCDF4')
    else:  # Read/append mode
        # Try to open the checkpoint file, it is okay if it is not there.
        try:
            checkpoint_ncfile = netcdf.Dataset(self._storage_file_checkpoint, mode, format='NETCDF4')
        except IOError:  # Trap the "not on disk" error
            logger.debug('Could not locate checkpoint subfile. This is okay for certain append and read '
                         'operations, but not for production simulation!')
        else:
            sub_ncfiles['checkpoint'] = checkpoint_ncfile
            # Primary and subfiles are linked by UUID: verify the link explicitly
            # (an ``assert`` would vanish when Python runs with -O).
            try:
                primary_uuid = primary_ncfiles['analysis'].UUID
                check_uuid = checkpoint_ncfile.UUID
            except AttributeError:
                raise AttributeError("Checkpoint file is missing its linked primary file UUID! "
                                     "No way to ensure this checkpoint file contains data matching the "
                                     "analysis file!")
            if primary_uuid != check_uuid:
                raise IOError("Checkpoint UUID does not match analysis UUID! This checkpoint file came from "
                              "another simulation!\n"
                              "Analysis UUID: {}\n"
                              "Checkpoint UUID: {}".format(primary_uuid, check_uuid))

    def check_and_build_storage_file(ncfile, nc_name, checkpoint_interval):
        """Write the default layout to ``ncfile`` if it has not been formatted yet.

        A file is considered unformatted when it has no 'scalar' dimension.
        Returns True if the layout was written (the file still needs its
        UUID), False if the file was already formatted.
        """
        if 'scalar' in ncfile.dimensions:
            return False

        # Create common dimensions.
        ncfile.createDimension('scalar', 1)  # Scalar dimension.
        ncfile.createDimension('iteration', 0)  # Unlimited number of iterations.
        ncfile.createDimension('spatial', 3)  # Number of spatial dimensions.

        # Set global attributes.
        ncfile.application = 'YANK'
        ncfile.program = 'yank.py'
        ncfile.programVersion = version.short_version
        ncfile.Conventions = 'ReplicaExchange'
        ncfile.ConventionVersion = '0.2'
        ncfile.DataUsedFor = nc_name
        ncfile.CheckpointInterval = checkpoint_interval

        # Create and initialize the global variables.
        nc_last_good_iter = ncfile.createVariable('last_iteration', int, 'scalar')
        nc_last_good_iter[0] = 0
        return True

    # Setup the primary file if needed.
    # Generate a unique UUID in case we need to assign it.
    # Uses uuid4 to avoid assigning hostname information.
    primary_uuid = str(uuid.uuid4())
    ncfile_ever_built = False
    for nc_name, ncfile in primary_ncfiles.items():
        if check_and_build_storage_file(ncfile, nc_name, self._checkpoint_interval):
            ncfile_ever_built = True
        # Expose the handle as self._storage_<name>.
        setattr(self, '_storage_' + nc_name, ncfile)
    if ncfile_ever_built:
        # Assign the same UUID to all primary files.
        for ncfile in primary_ncfiles.values():
            ncfile.UUID = primary_uuid
    else:
        # Nothing was built: reuse the UUID already on file.
        primary_uuid = primary_ncfiles['analysis'].UUID

    # Handle subfiles. If they are in the sub_ncfiles dict, they exist and are open.
    for nc_name, ncfile in sub_ncfiles.items():
        if check_and_build_storage_file(ncfile, nc_name, self._checkpoint_interval):
            # Link the freshly built subfile to the primary file.
            ncfile.UUID = primary_uuid
        setattr(self, '_storage_' + nc_name, ncfile)

    if self._storage_analysis is not None:
        # The same number will be on checkpoint file as well, but it is not guaranteed to be present.
        on_file_interval = self._storage_analysis.CheckpointInterval
        if on_file_interval != self._checkpoint_interval:
            logger.debug("checkpoint_interval != on-file checkpoint interval! "
                         "Using on file analysis interval of {}.".format(on_file_interval))
            self._checkpoint_interval = on_file_interval

        # Check the special particle indices.
        # Handle the "variable does not exist" case.
        if 'analysis_particle_indices' not in self._storage_analysis.variables:
            n_particles = len(self._analysis_particle_indices)
            # This dimension won't exist if the variable does not either.
            self._storage_analysis.createDimension('analysis_particles', n_particles)
            ncvar_analysis_particles = \
                self._storage_analysis.createVariable('analysis_particle_indices', int, 'analysis_particles')
            ncvar_analysis_particles[:] = self._analysis_particle_indices
            ncvar_analysis_particles.long_name = ("analysis_particle_indices[analysis_particles] is the "
                                                  "indices of the particles with extra information stored "
                                                  "about them in the analysis file.")

        # Now handle the "variable does exist but does not match the provided ones".
        # Although redundant if it was just created, it is an easy check to make.
        stored_analysis_particles = self._storage_analysis.variables['analysis_particle_indices'][:]
        stored_indices = tuple(stored_analysis_particles.astype(int))
        if self._analysis_particle_indices != stored_indices:
            logger.debug("analysis_particle_indices != on-file analysis_particle_indices! "
                         "Using on file analysis indices of {}".format(stored_analysis_particles))
            self._analysis_particle_indices = stored_indices
def close(self):
    """Close the storage files.

    Every storage that is still open is synchronized and closed, then its
    handle attribute (``_storage_analysis`` / ``_storage_checkpoint``) is
    reset to None so that the reporter no longer references a closed file.
    """
    for storage_name, storage in self._storage_dict.items():
        if storage is None:
            continue
        if storage.isopen():
            storage.sync()
            storage.close()
        # The attribute name needs the underscore separator to match the
        # handles created elsewhere in the class ('_storage_' + name).
        setattr(self, '_storage_' + storage_name, None)
def sync(self):
    """Force any buffer to be flushed to the file.

    Storages without a handle (None) are skipped.
    """
    open_handles = [handle for handle in self._storage if handle is not None]
    for handle in open_handles:
        handle.sync()
def __del__(self):
    """Synchronize and close the storage.

    The handles are looked up defensively: if ``__init__`` raised before
    the storage attributes were assigned, the finalizer still runs and
    must not fail with an AttributeError.
    """
    handles = (getattr(self, '_storage_analysis', None),
               getattr(self, '_storage_checkpoint', None))
    if any(handle is not None for handle in handles):
        self.close()

@mmtools.utils.with_timer('Reading thermodynamic states from storage')
def read_thermodynamic_states(self):
    """Retrieve the stored thermodynamic states from the analysis storage.

    States that were stored without their system (because a compatible
    state had already been written) get the serialized standard system of
    the state they reference before being deserialized.

    Returns
    -------
    thermodynamic_states : list of ThermodynamicStates
        The previously stored thermodynamic states. During the simulation
        these are swapped among replicas.
    unsampled_states : list of ThermodynamicState
        The previously stored unsampled thermodynamic states. Empty if the
        storage has no 'unsampled_states' group.

    See Also
    --------
    read_replica_thermodynamic_states

    """
    # We have to parse the thermodynamic states first because the
    # unsampled states may refer to them for the serialized system.
    states = collections.OrderedDict([('thermodynamic_states', list()),
                                      ('unsampled_states', list())])

    # Caches standard_system_name: serialized_standard_system
    states_serializations = dict()

    # Read state information.
    for state_type, state_list in states.items():
        # There may not be unsampled states.
        if state_type not in self._storage_analysis.groups:
            assert state_type == 'unsampled_states'
            continue

        # One variable per stored state, named state0, state1, ...
        n_states = len(self._storage_analysis.groups[state_type].variables)
        for state_id in range(n_states):
            serialized_state = self.read_dict('{}/state{}'.format(state_type, state_id))

            # Find the thermodynamic state representation.
            # The loop is necessary for nested CompoundThermodynamicStates.
            serialized_thermodynamic_state = serialized_state
            while 'thermodynamic_state' in serialized_thermodynamic_state:
                serialized_thermodynamic_state = serialized_thermodynamic_state['thermodynamic_state']

            # Check if the standard system is stored in a previous state.
            try:
                standard_system_name = serialized_thermodynamic_state.pop('_Reporter__compatible_state')
            except KeyError:
                # Cache the standard system serialization for future usage.
                standard_system_name = '{}/{}'.format(state_type, state_id)
                states_serializations[standard_system_name] = serialized_thermodynamic_state['standard_system']
            else:
                # The system serialization can be retrieved from another state.
                serialized_standard_system = states_serializations[standard_system_name]
                serialized_thermodynamic_state['standard_system'] = serialized_standard_system

            # Create ThermodynamicState object.
            state_list.append(mmtools.utils.deserialize(serialized_state))

    return [states['thermodynamic_states'], states['unsampled_states']]
@mmtools.utils.with_timer('Storing thermodynamic states')
def write_thermodynamic_states(self, thermodynamic_states, unsampled_states):
    """Store all the ThermodynamicStates as serialized dictionaries.

    To save space, the standard system is serialized only for the first
    state of each group of mutually compatible states. The others are
    stored without it, together with the name of the state holding it.

    Parameters
    ----------
    thermodynamic_states : list of ThermodynamicState
        The thermodynamic states to store.
    unsampled_states : list of ThermodynamicState
        The unsampled thermodynamic states to store.

    See Also
    --------
    write_replica_thermodynamic_states

    """
    def innermost_state(serialization):
        """Follow nested 'thermodynamic_state' entries down to the innermost dict."""
        while 'thermodynamic_state' in serialization:
            serialization = serialization['thermodynamic_state']
        return serialization

    # Fully serialized states mapped to the name they can be referenced by.
    reference_names = dict()
    state_groups = [('thermodynamic_states', thermodynamic_states),
                    ('unsampled_states', unsampled_states)]

    for state_type, states in state_groups:
        for state_id, state in enumerate(states):
            for known_state, known_name in reference_names.items():
                if known_state.is_state_compatible(state):
                    # A compatible state is already stored: skip the system
                    # and point to the state that carries it.
                    serialized_state = mmtools.utils.serialize(state, skip_system=True)
                    inner = innermost_state(serialized_state)
                    inner.pop('standard_system')  # Remove the unneeded system object
                    inner['_Reporter__compatible_state'] = known_name
                    break
            else:
                # No compatible state found: do a full serialization and
                # register this state as a reference for the following ones.
                serialized_state = mmtools.utils.serialize(state)
                inner = innermost_state(serialized_state)
                n_bytes = len(inner['standard_system'])
                state_name = '{}/{}'.format(state_type, state_id)
                reference_names[state] = state_name
                logger.debug("Serialized state {} is {}B | {:.3f}KB | {:.3f}MB".format(
                    state_name, n_bytes, n_bytes/1024.0, n_bytes/1024.0/1024.0))

            # Finally write the dictionary.
            self.write_dict('{}/state{}'.format(state_type, state_id), serialized_state, fixed_dimension=True)
def read_sampler_states(self, iteration, analysis_particles_only=False):
    """Retrieve the stored sampler states.

    Full sampler states are read from the checkpoint file, honoring the
    checkpoint interval. With ``analysis_particles_only`` the reduced
    sampler states are read from the analysis file instead, ignoring the
    checkpoint interval.

    Parameters
    ----------
    iteration : int
        The iteration at which to read the data.
    analysis_particles_only : bool, Optional, Default: False
        If set, read the trajectory of ONLY the analysis particles flagged
        on original creation of the files.

    Returns
    -------
    sampler_states
        Whatever ``_read_sampler_states_from_given_file`` returns for the
        selected file.

    Raises
    ------
    ValueError
        If ``analysis_particles_only`` is set but the reporter has no
        analysis particle indices.

    """
    if not analysis_particles_only:
        return self._read_sampler_states_from_given_file(iteration, storage_file='checkpoint',
                                                         obey_checkpoint_interval=True)

    if len(self._analysis_particle_indices) == 0:
        raise ValueError("No particles were flagged for special analysis! "
                         "No such trajectory would have been written!")
    return self._read_sampler_states_from_given_file(iteration, storage_file='analysis',
                                                     obey_checkpoint_interval=False)
@mmtools.utils.with_timer('Storing sampler states')
def write_sampler_states(self, sampler_states, iteration):
    """Store all sampler states for a given iteration.

    The full sampler states go to the checkpoint file, subject to the
    checkpoint interval. If analysis particle indices are set, a reduced
    copy holding only those particles is additionally written to the
    analysis file, regardless of the checkpoint interval.

    Parameters
    ----------
    sampler_states : list of SamplerStates
        The sampler states to store for each replica.
    iteration : int
        The iteration at which to store the data.

    """
    self._write_sampler_states_to_given_file(sampler_states, iteration, storage_file='checkpoint',
                                             obey_checkpoint_interval=True)

    # Without special particles there is nothing else to store.
    if len(self._analysis_particle_indices) == 0:
        return

    # The [indices, :] form gives uniform behavior for tuple and list
    # indices, since ndarray[tuple] differs from ndarray[list].
    analysis_states = [
        mmtools.states.SamplerState(sampler_state.positions[self._analysis_particle_indices, :],
                                    box_vectors=sampler_state.box_vectors)
        for sampler_state in sampler_states
    ]
    self._write_sampler_states_to_given_file(analysis_states, iteration, storage_file='analysis',
                                             obey_checkpoint_interval=False)
def read_replica_thermodynamic_states(self, iteration=slice(None)):
    """Retrieve the indices of the ThermodynamicStates for each replica on the analysis file.

    Parameters
    ----------
    iteration : int or slice
        The iteration(s) at which to read the data. The slice(None) allows
        fetching all iterations at once. The value is first passed through
        ``_calculate_last_iteration``.

    Returns
    -------
    state_indices : numpy.ndarray of numpy.int64
        At the given iteration, replica ``i`` propagated the system in
        SamplerState ``sampler_states[i]`` and ThermodynamicState
        ``thermodynamic_states[states_indices[i]]``.

    """
    resolved_iteration = self._calculate_last_iteration(iteration)
    stored_states = self._storage_analysis.variables['states']
    return stored_states[resolved_iteration].astype(np.int64)
def write_replica_thermodynamic_states(self, state_indices, iteration):
    """Store the indices of the ThermodynamicStates for each replica on the analysis file.

    The 'states' variable (and the 'replica' dimension, if missing) is
    created on the first call, sized after ``len(state_indices)``.

    Parameters
    ----------
    state_indices : list of int
        At the given iteration, replica ``i`` propagated the system in
        SamplerState ``sampler_states[i]`` and ThermodynamicState
        ``thermodynamic_states[replica_thermodynamic_states[i]]``.
    iteration : int
        The iteration at which to store the data.

    """
    analysis = self._storage_analysis

    # Initialize schema if needed.
    if 'states' not in analysis.variables:
        n_replicas = len(state_indices)

        # The dimension may have been created by another write function.
        if 'replica' not in analysis.dimensions:
            analysis.createDimension('replica', n_replicas)

        # Create variable and attach units and description.
        states_var = analysis.createVariable('states', 'i4', ('iteration', 'replica'),
                                             zlib=False, chunksizes=(1, n_replicas))
        states_var.units = 'none'
        states_var.long_name = ("states[iteration][replica] is the thermodynamic state index "
                                "(0..nstates-1) of replica 'replica' of iteration 'iteration'.")

    # Store thermodynamic states indices.
    analysis.variables['states'][iteration, :] = state_indices[:]
def read_mcmc_moves(self):
    """Return the MCMCMoves of the :class:`yank.repex.ReplicaExchange` simulation.

    The moves are read, in order, from the 'mcmc_moves' group of the
    analysis storage, one variable (``move0``, ``move1``, ...) per move.

    Returns
    -------
    mcmc_moves : list of MCMCMove
        The MCMCMoves used to propagate the simulation.

    """
    n_moves = len(self._storage_analysis.groups['mcmc_moves'].variables)
    move_names = ('mcmc_moves/move{}'.format(move_id) for move_id in range(n_moves))
    return [mmtools.utils.deserialize(self.read_dict(move_name)) for move_name in move_names]
def write_mcmc_moves(self, mcmc_moves):
    """Store the MCMCMoves of the :class:`yank.repex.ReplicaExchange` simulation.

    Each move is serialized and written through ``write_dict`` under the
    name ``mcmc_moves/move<index>``.

    Parameters
    ----------
    mcmc_moves : list of MCMCMove
        The MCMCMoves used to propagate the simulation.

    """
    for move_index, mcmc_move in enumerate(mcmc_moves):
        move_name = 'mcmc_moves/move{}'.format(move_index)
        self.write_dict(move_name, mmtools.utils.serialize(mcmc_move))
def read_energies(self, iteration=slice(None)):
    """Retrieve the energy matrix at the given iteration on the analysis file.

    Parameters
    ----------
    iteration : int or slice
        The iteration(s) at which to read the data. The slice(None) allows
        fetching all iterations at once. The value is first passed through
        ``_calculate_last_iteration``.

    Returns
    -------
    energy_thermodynamic_states : n_replicas x n_replicas numpy.ndarray
        ``energy_thermodynamic_states[iteration, i, j]`` is the reduced
        potential computed at SamplerState ``sampler_states[iteration, i]``
        and ThermodynamicState ``thermodynamic_states[iteration, j]``.
    energy_unsampled_states : n_replicas x n_unsampled_states numpy.ndarray
        ``energy_unsampled_states[iteration, i, j]`` is the reduced
        potential computed at SamplerState ``sampler_states[iteration, i]``
        and ThermodynamicState ``unsampled_thermodynamic_states[iteration, j]``.
        If no unsampled energies are stored, an array with a last
        dimension of size 0 is returned.

    """
    iteration = self._calculate_last_iteration(iteration)
    variables = self._storage_analysis.variables
    sampled_energies = variables['energies'][iteration, :, :]
    try:
        unsampled_energies = variables['unsampled_energies'][iteration, :, :]
    except KeyError:
        # There are no unsampled thermodynamic states: keep the leading
        # dimensions of the sampled energies and use an empty last one.
        unsampled_energies = np.zeros(sampled_energies.shape[:-1] + (0,))
    return sampled_energies, unsampled_energies
def write_energies(self, energy_thermodynamic_states, energy_unsampled_states, iteration):
    """Store the energy matrix at the given iteration on the analysis file.

    The 'energies' and 'unsampled_energies' variables (and the 'replica'
    and 'unsampled' dimensions) are created on demand. Unsampled energies
    are only stored if ``energy_unsampled_states`` has at least one column.

    Parameters
    ----------
    energy_thermodynamic_states : n_replicas x n_replicas numpy.ndarray
        ``energy_thermodynamic_states[i][j]`` is the reduced potential
        computed at SamplerState ``sampler_states[i]`` and
        ThermodynamicState ``thermodynamic_states[j]``.
    energy_unsampled_states : n_replicas x n_unsampled_states numpy.ndarray
        ``energy_unsampled_states[i][j]`` is the reduced potential computed
        at SamplerState ``sampler_states[i]`` and ThermodynamicState
        ``unsampled_thermodynamic_states[j]``.
    iteration : int
        The iteration at which to store the data.

    """
    analysis = self._storage_analysis
    # Needed by both schema branches below, so it must be computed up
    # front: 'energies' may already exist when 'unsampled_energies' is created.
    n_replicas = len(energy_thermodynamic_states)
    has_unsampled_states = energy_unsampled_states.shape[1] > 0

    # Initialize schema if needed.
    if 'energies' not in analysis.variables:
        # Create replica dimension if it wasn't created by other functions.
        if 'replica' not in analysis.dimensions:
            analysis.createDimension('replica', n_replicas)

        # Create variable for thermodynamic state energies with units and descriptions.
        ncvar_energies = analysis.createVariable('energies', 'f8', ('iteration', 'replica', 'replica'),
                                                 zlib=False, chunksizes=(1, n_replicas, n_replicas))
        ncvar_energies.units = 'kT'
        ncvar_energies.long_name = ("energies[iteration][replica][state] is the reduced (unitless) "
                                    "energy of replica 'replica' from iteration 'iteration' evaluated "
                                    "at the thermodynamic state 'state'.")

    # Check if we have unsampled states.
    if has_unsampled_states and 'unsampled_energies' not in analysis.variables:
        n_unsampled_states = len(energy_unsampled_states[0])

        # Create unsampled dimension if it wasn't created by other functions.
        if 'unsampled' not in analysis.dimensions:
            analysis.createDimension('unsampled', n_unsampled_states)

        # Create variable for unsampled state energies with units and descriptions.
        ncvar_unsampled = analysis.createVariable('unsampled_energies', 'f8',
                                                  ('iteration', 'replica', 'unsampled'),
                                                  zlib=False,
                                                  chunksizes=(1, n_replicas, n_unsampled_states))
        ncvar_unsampled.units = 'kT'
        ncvar_unsampled.long_name = ("unsampled_energies[iteration][replica][state] is the reduced "
                                     "(unitless) energy of replica 'replica' from iteration 'iteration' "
                                     "evaluated at unsampled thermodynamic state 'state'.")

    # Store states energy.
    analysis.variables['energies'][iteration, :, :] = energy_thermodynamic_states[:, :]
    if has_unsampled_states:
        analysis.variables['unsampled_energies'][iteration, :, :] = energy_unsampled_states[:, :]
def read_mixing_statistics(self, iteration=slice(None)):
    """Retrieve the mixing statistics for the given iteration on the analysis file.

    Parameters
    ----------
    iteration : int or slice
        The iteration(s) at which to read the data. The value is first
        passed through ``_calculate_last_iteration``.

    Returns
    -------
    n_accepted_matrix : kxk numpy.ndarray
        ``n_accepted_matrix[i][j]`` is the number of accepted moves from
        state ``thermodynamic_states[i]`` to ``thermodynamic_states[j]``
        going from ``iteration-1`` to ``iteration`` (not cumulative).
    n_proposed_matrix : kxk numpy.ndarray
        ``n_proposed_matrix[i][j]`` is the number of proposed moves from
        state ``thermodynamic_states[i]`` to ``thermodynamic_states[j]``
        going from ``iteration-1`` to ``iteration`` (not cumulative).

    Both matrices are returned as ``numpy.int64`` arrays.

    """
    iteration = self._calculate_last_iteration(iteration)
    variables = self._storage_analysis.variables
    accepted = variables['accepted'][iteration, :, :].astype(np.int64)
    proposed = variables['proposed'][iteration, :, :].astype(np.int64)
    return accepted, proposed
def write_mixing_statistics(self, n_accepted_matrix, n_proposed_matrix, iteration):
    """Store the mixing statistics for the given iteration on the analysis file.

    The 'accepted' and 'proposed' variables (and the 'replica' dimension,
    if missing) are created on the first call.

    Parameters
    ----------
    n_accepted_matrix : kxk numpy.ndarray
        ``n_accepted_matrix[i][j]`` is the number of accepted moves from
        state ``thermodynamic_states[i]`` to ``thermodynamic_states[j]``
        going from ``iteration-1`` to ``iteration`` (not cumulative).
    n_proposed_matrix : kxk numpy.ndarray
        ``n_proposed_matrix[i][j]`` is the number of proposed moves from
        state ``thermodynamic_states[i]`` to ``thermodynamic_states[j]``
        going from ``iteration-1`` to ``iteration`` (not cumulative).
    iteration : int
        The iteration at which to store the data.

    """
    analysis = self._storage_analysis

    # Create schema if necessary.
    if 'accepted' not in analysis.variables:
        n_states = len(n_accepted_matrix)

        # Create replica dimension if it wasn't already created.
        if 'replica' not in analysis.dimensions:
            analysis.createDimension('replica', n_states)

        # Create variables with units and descriptions.
        ncvar_accepted = analysis.createVariable('accepted', 'i4', ('iteration', 'replica', 'replica'),
                                                 zlib=False, chunksizes=(1, n_states, n_states))
        ncvar_proposed = analysis.createVariable('proposed', 'i4', ('iteration', 'replica', 'replica'),
                                                 zlib=False, chunksizes=(1, n_states, n_states))
        ncvar_accepted.units = 'none'
        ncvar_proposed.units = 'none'
        # Each description must name its own quantity: these strings are
        # stored in the file and are what a reader of the file will see.
        ncvar_accepted.long_name = ("accepted[iteration][i][j] is the number of accepted transitions "
                                    "between states i and j from iteration 'iteration-1'.")
        ncvar_proposed.long_name = ("proposed[iteration][i][j] is the number of proposed transitions "
                                    "between states i and j from iteration 'iteration-1'.")

    # Store statistics.
    analysis.variables['accepted'][iteration, :, :] = n_accepted_matrix[:, :]
    analysis.variables['proposed'][iteration, :, :] = n_proposed_matrix[:, :]
def read_timestamp(self, iteration=slice(None)):
    """Return the timestamp for the given iteration.

    The value is read from the analysis file, although there is a paired
    timestamp on the checkpoint file as well.

    Parameters
    ----------
    iteration : int or slice
        The iteration(s) at which to read the data. The value is first
        passed through ``_calculate_last_iteration``.

    Returns
    -------
    timestamp : str
        The timestamp at which the iteration was stored.

    """
    resolved_iteration = self._calculate_last_iteration(iteration)
    timestamps = self._storage_analysis.variables['timestamp']
    return timestamps[resolved_iteration]
def write_timestamp(self, iteration):
    """Store a timestamp for the given iteration on both analysis and checkpoint file.

    The same ``time.ctime()`` string is written to both files. On the
    checkpoint file it is written only when
    ``_calculate_checkpoint_iteration`` returns an index for ``iteration``.

    Parameters
    ----------
    iteration : int
        The iteration at which to store the data.

    """
    # Create the variable on every storage that lacks it.
    for storage in self._storage:
        if 'timestamp' in storage.variables:
            continue
        storage.createVariable('timestamp', str, ('iteration',), zlib=False, chunksizes=(1,))

    now = time.ctime()
    self._storage_analysis.variables['timestamp'][iteration] = now

    checkpoint_iteration = self._calculate_checkpoint_iteration(iteration)
    if checkpoint_iteration is None:
        return
    self._storage_checkpoint.variables['timestamp'][checkpoint_iteration] = now
def read_dict(self, name):
    """Restore a dictionary from the storage file.

    Parameters
    ----------
    name : str
        The identifier of the dictionary used to store the data. A path
        with ``/`` separators addresses a variable nested inside groups.

    Returns
    -------
    data : dict
        The restored data as a dict.

    """
    # Leaving the skeleton to extend this in for now
    storage = 'analysis'
    if storage not in ['analysis', 'checkpoint']:
        raise ValueError("storage must be either 'analysis' or 'checkpoint'!")

    # Get NC variable.
    nc_variable = self._resolve_variable_path(name, storage)
    if nc_variable.dtype == 'S1':
        # Handle variables stored in fixed dimensions: one byte per
        # character. ``tobytes`` replaces the deprecated ``tostring`` alias
        # and returns the same bytes.
        data_chars = nc_variable[:]
        data_str = data_chars.tobytes().decode()
    else:
        data_str = str(nc_variable[0])

    # NOTE(review): yaml.load executes whatever constructors the loader
    # allows; _DictYamlLoader is defined elsewhere in this module, so confirm
    # it is a safe loader before reading files from untrusted sources.
    data = yaml.load(data_str, Loader=_DictYamlLoader)

    # Restore the title in the metadata.
    if name == 'metadata':
        data['title'] = self._storage_dict[storage].title
    return data
def write_dict(self, name, data, fixed_dimension=False):
    """Serialize a dict to YAML and store it on the analysis file.

    Parameters
    ----------
    name : str
        The identifier of the dictionary in the storage file.
    data : dict
        The dict to store.
    fixed_dimension : bool, Default: False
        Use a fixed-length dimension instead of a variable-length one.
        The dimension is named ``"fixedL{}".format(len(data_string))`` and
        is created if it does not exist yet. This seems to allow NetCDF to
        actually compress strings. Do NOT use this flag if the length of
        the data is expected to change between writes; use it only for
        static data.

    """
    # Only the analysis file is targeted for now; the check is kept as a
    # placeholder for when the destination becomes selectable.
    storage = 'analysis'
    if storage not in ['analysis', 'checkpoint']:
        raise ValueError("storage must be either 'analysis' or 'checkpoint'!")
    target_nc = self._storage_dict[storage]

    # General NetCDF conventions put the dataset title in a global
    # attribute, while users specify it in the metadata: move it across
    # on a copy so the caller's dict is left untouched.
    if name == 'metadata':
        data = copy.deepcopy(data)
        self._storage_dict[storage].title = data.pop('title')

    serialized = yaml.dump(data, Dumper=_DictYamlDumper)

    # Reuse the variable when updating, create it on first write.
    try:
        variable = self._resolve_variable_path(name, storage)
    except KeyError:
        if not fixed_dimension:
            variable = target_nc.createVariable(name, str, 'scalar', zlib=True)
        else:
            n_chars = len(serialized)
            dimension_name = "fixedL{}".format(n_chars)
            if dimension_name not in target_nc.dimensions:
                target_nc.createDimension(dimension_name, n_chars)
            variable = target_nc.createVariable(name, 'S1', dimension_name, zlib=True)

    if fixed_dimension:
        # One single-byte element per character.
        payload = np.array(list(serialized), dtype='S1')
    else:
        # Variable-length strings are passed as a 1-element object array.
        payload = np.empty(1, 'O')
        payload[0] = serialized
    variable[:] = payload
def read_last_iteration(self, full_iteration=True):
    """Read the last iteration that was written to file in sequential order.

    Parameters
    ----------
    full_iteration : bool, optional
        If True, return the most recent iteration at or before the last
        written one that falls on a checkpoint (default is True).

    Returns
    -------
    last_iteration : int
        Last iteration which was sequentially written.

    Raises
    ------
    RuntimeError
        If ``full_iteration`` is True and no iteration down to 0 maps to
        a checkpoint.

    """
    # Cast so that a plain Python int is always returned.
    last_written = int(self._storage_analysis.variables['last_iteration'][0])
    if not full_iteration:
        return last_written

    # Walk backwards, iteration 0 included, until a checkpoint is hit.
    candidate = last_written
    while candidate >= 0:
        if self._calculate_checkpoint_iteration(candidate) is not None:
            return candidate
        candidate -= 1
    raise RuntimeError("Could not find a checkpoint! This should not happen "
                       "as the 0th iteration should always be written!")
def write_last_iteration(self, iteration):
    """Record the last iteration that was written in sequential order.

    This allows resuming and analysis to use only valid data. Call it as
    the last step of any ``write_iteration``-like routine so that junk
    data left over from an interrupted simulation is never analyzed.

    Parameters
    ----------
    iteration : int
        Iteration at which the last good data point was written.

    """
    last_iteration_variable = self._storage_analysis.variables['last_iteration']
    last_iteration_variable[0] = iteration
def read_mbar_free_energies(self, iteration):
    """Read the MBAR dimensionless free energies stored at a given iteration.

    The values are read from the ``online_analysis`` group of the analysis
    file. Used primarily in online analysis, in tandem with a
    :class:`yank.analyze.YankPhaseAnalyzer` object from the
    :mod:`yank.analyze` module.

    Parameters
    ----------
    iteration : int
        Iteration to read the free energies from.
        NOTE(review): ``write_mbar_free_energies`` creates both variables
        with ``fill_value=np.inf``, so iterations that were never written
        presumably come back as the fill value - confirm.

    Returns
    -------
    f_k : 1-D ndarray of floats
        MBAR free energies from the iteration.
    free_energy : tuple of two floats
        Free energy estimate at the iteration between the end states.

    """
    online_variables = self._storage_analysis.groups['online_analysis'].variables
    stored_f_k = online_variables['f_k'][iteration]
    stored_free_energy = online_variables['free_energy'][iteration, :]
    return stored_f_k, stored_free_energy
def write_mbar_free_energies(self, iteration, f_k, free_energy):
    """Write the MBAR free energies at the current iteration.

    See :func:`read_mbar_free_energies` for more information about
    pymbar's f_k free energies. Used primarily in online analysis, in
    tandem with a :class:`yank.analyze.YankPhaseAnalyzer` object from the
    :mod:`analyze` module.

    Parameters
    ----------
    iteration : int
        Iteration at which to save the free energy.
    f_k : 1D array of floats
        Dimensionless free energies to store. This should come from
        ``pymbar.MBAR.f_k`` of an initialized MBAR object.
    free_energy : tuple of two floats
        Current estimate of the free energy difference of the phase and
        its error, in the form ``(free_energy, error_in_free_energy)``.
        Typically output of ``pymbar.MBAR.getFreeEnergyDifferences()[i,j]``.

    """
    nc = self._storage_analysis

    # The dimensions, the group and its variables are created together on
    # the first write; the 'f_k_length' dimension marks that this was done.
    schema_missing = 'f_k_length' not in nc.dimensions
    if schema_missing:
        n_free_energies = len(f_k)
        nc.createDimension('f_k_length', n_free_energies)
        nc.createDimension('Df', 2)
        new_group = nc.createGroup('online_analysis')
        # Larger chunks, faster operations; the matrices are small anyway.
        shared_options = dict(zlib=True, fill_value=np.inf)
        new_group.createVariable('f_k', float, dimensions=('iteration', 'f_k_length'),
                                 chunksizes=(1, n_free_energies), **shared_options)
        new_group.createVariable('free_energy', float, dimensions=('iteration', 'Df'),
                                 chunksizes=(1, 2), **shared_options)

    online_variables = nc.groups['online_analysis'].variables
    online_variables['f_k'][iteration] = f_k
    online_variables['free_energy'][iteration, :] = free_energy
# -------------------------------------------------------------------------
# Internal-usage.
# -------------------------------------------------------------------------

def _resolve_variable_path(self, path, storage):
    """Return the NC variable at the end of the path.

    This can be used to retrieve variables inside one or more groups.

    Parameters
    ----------
    path : str
        Group names separated by ``/``, ending with the variable name.
    storage : str
        Key of ``self._storage_dict`` selecting the file to search.

    Raises
    ------
    KeyError
        If a group or the variable does not exist (callers such as
        ``write_dict`` rely on this to detect a missing variable).

    """
    path_split = path.split('/')
    nc_group = self._storage_dict[storage]
    for group_name in path_split[:-1]:
        nc_group = nc_group.groups[group_name]
    return nc_group.variables[path_split[-1]]

def _calculate_checkpoint_iteration(self, iteration):
    """Map an analysis iteration to its index on the checkpoint file.

    Although this is a simple function, it provides a single place for
    the calculation.

    Returns
    -------
    checkpoint_index : int or None
        ``iteration / checkpoint_interval`` when ``iteration`` is an exact
        multiple of the checkpoint interval, None otherwise.

    """
    checkpoint_index = float(iteration) / self._checkpoint_interval
    if checkpoint_index.is_integer():
        return int(checkpoint_index)
    return None

def _calculate_last_iteration(self, iteration):
    """Clip an iteration index or slice to the last good iteration.

    Used by the 'read_X' functions which take ``iteration=slice(None)`` to
    avoid returning data that goes past ``last_iteration``.

    Parameters
    ----------
    iteration : int or slice
        Iteration to feed into the check.

    Returns
    -------
    cast_iteration : int or slice
        The iteration, converted as needed to only access valid data. It
        is returned unchanged when the last good iteration is also the
        last one on file.

    Raises
    ------
    IndexError
        If an integer iteration lies outside the valid range.
    ValueError
        If ``iteration`` is neither an int nor a slice (only checked when
        the file extends past the last good iteration).

    """
    last_good = self.read_last_iteration(full_iteration=False)
    max_iter = len(self._storage_analysis.dimensions['iteration'])

    def convert_negative_iteration(iteration):
        # Shift a negative index by the number of trailing junk iterations
        # so that -1 addresses the last good iteration.
        return iteration - (max_iter - last_good - 1)

    if last_good == max_iter - 1:
        cast_iteration = iteration
    else:
        if type(iteration) is int:
            throw = False
            if iteration > last_good:  # check positive integer
                throw = True
            elif iteration < 0:  # Operation on negative integer, recast
                # Convert negative integer to its mapped values
                cast_iteration = convert_negative_iteration(iteration)
                if cast_iteration < -max_iter:  # check against absolute size
                    throw = True
            else:
                cast_iteration = iteration
            if throw:
                raise IndexError("Index out of range!")
        elif type(iteration) is slice:
            out_start = None
            out_stop = None
            # Condition the start: it must not exceed last good (positive),
            # and negative values are remapped onto the valid range.
            if iteration.start is not None:
                if iteration.start > last_good:
                    out_start = last_good
                elif iteration.start < 0:
                    out_start = convert_negative_iteration(iteration.start)
                else:
                    # Already inside the valid range: keep it. This case was
                    # previously dropped, turning e.g. slice(2, 9) into
                    # slice(None, ...).
                    out_start = iteration.start
            # Condition end iteration
            if iteration.stop is not None:
                if iteration.stop > last_good:
                    out_stop = last_good + 1
                elif iteration.stop < 0:
                    # NOTE(review): the extra -1 excludes one more element
                    # than a plain negative stop on the valid range would;
                    # left as is - confirm it is intended.
                    out_stop = convert_negative_iteration(iteration.stop) - 1
                else:
                    # Already inside the valid range: keep it. Dropping it
                    # made the slice run to the end of the file, past the
                    # last good iteration.
                    out_stop = iteration.stop
            else:
                # Trap special None cases for the "stop".
                # Only a forward slice needs an explicit upper bound.
                # NOTE(review): a negative step with start=None still begins
                # at the end of the file - confirm whether it needs clipping.
                if iteration.step is None or (iteration.step is not None and iteration.step > 0):
                    out_stop = last_good + 1
            cast_iteration = slice(out_start, out_stop, iteration.step)
        else:
            raise ValueError("Iteration must be either an int or a slice!")
    return cast_iteration

@staticmethod
def _initilize_sampler_variables_on_file(dataset, n_atoms, n_states):
    """Initialize the NetCDF variables needed to store sampler states.

    Does nothing if the file was already initialized, i.e. it already has
    a ``positions`` variable. The ``iteration`` and ``spatial`` dimensions
    are used but not created here.

    Parameters
    ----------
    dataset : NetCDF4 Dataset
        Dataset to validate
    n_atoms : int
        Number of atoms which will be stored
    n_states : int
        Number of Sampler states which will be written

    """
    if 'positions' not in dataset.variables:
        # Create dimensions. Replica dimension could have been created before.
        dataset.createDimension('atom', n_atoms)
        if 'replica' not in dataset.dimensions:
            dataset.createDimension('replica', n_states)

        # Create variables.
        ncvar_positions = dataset.createVariable('positions', 'f4',
                                                 ('iteration', 'replica', 'atom', 'spatial'),
                                                 zlib=True, chunksizes=(1, n_states, n_atoms, 3))
        ncvar_box_vectors = dataset.createVariable('box_vectors', 'f4',
                                                   ('iteration', 'replica', 'spatial', 'spatial'),
                                                   zlib=False, chunksizes=(1, n_states, 3, 3))
        ncvar_volumes = dataset.createVariable('volumes', 'f8', ('iteration', 'replica'),
                                               zlib=False, chunksizes=(1, n_states))

        # Define units for variables.
        setattr(ncvar_positions, 'units', 'nm')
        setattr(ncvar_box_vectors, 'units', 'nm')
        setattr(ncvar_volumes, 'units', 'nm**3')

        # Define long (human-readable) names for variables.
        setattr(ncvar_positions, "long_name",
                ("positions[iteration][replica][atom][spatial] is position of "
                 "coordinate 'spatial' of atom 'atom' from replica 'replica' for "
                 "iteration 'iteration'."))
        setattr(ncvar_box_vectors, "long_name",
                ("box_vectors[iteration][replica][i][j] is dimension j of box "
                 "vector i for replica 'replica' from iteration 'iteration-1'."))
        setattr(ncvar_volumes, "long_name",
                ("volume[iteration][replica] is the box volume for replica 'replica' "
                 "from iteration 'iteration-1'."))

def _write_sampler_states_to_given_file(self, sampler_states, iteration, storage_file='checkpoint',
                                        obey_checkpoint_interval=True):
    """Internal function to write sampler states to a chosen storage file.

    Parameters
    ----------
    sampler_states : list of openmmtools.states.SamplerStates
        The sampler states to store for each replica.
    iteration : int
        The iteration at which to store the data.
    storage_file : string, Optional, Default: 'checkpoint'
        Name of storage file we're writing to. Must match a valid key of
        ``self._storage_dict``.
    obey_checkpoint_interval : bool, Optional, Default: True
        If True, nothing is written unless ``iteration`` is on the
        checkpoint interval, and the data goes to the checkpoint index.
        If False, the write always occurs at index ``iteration``.

    """
    storage = self._storage_dict[storage_file]
    # Check if the schema must be initialized, do this regardless of the checkpoint_interval for consistency
    self._initilize_sampler_variables_on_file(storage, sampler_states[0].n_particles, len(sampler_states))
    if obey_checkpoint_interval:
        write_iteration = self._calculate_checkpoint_iteration(iteration)
    else:
        write_iteration = iteration
    # Write the sampler state if we are on the checkpoint interval OR if told to ignore the interval
    if write_iteration is not None:
        # Store sampler states.
        for replica_index, sampler_state in enumerate(sampler_states):
            # Store positions
            x = sampler_state.positions / unit.nanometers
            storage.variables['positions'][write_iteration, replica_index, :, :] = x[:, :]

            # Store box vectors and volume.
            for i in range(3):
                vector_i = sampler_state.box_vectors[i] / unit.nanometers
                storage.variables['box_vectors'][write_iteration, replica_index, i, :] = vector_i
            storage.variables['volumes'][write_iteration, replica_index] = \
                sampler_state.volume / unit.nanometers ** 3
    else:
        logger.debug("Iteration {} not on the Checkpoint Interval of {}. "
                     "Sampler State not written.".format(iteration, self._checkpoint_interval))

def _read_sampler_states_from_given_file(self, iteration, storage_file='checkpoint',
                                         obey_checkpoint_interval=True):
    """Internal function to read sampler states from a chosen storage file.

    Parameters
    ----------
    iteration : int
        Iteration on which to read data from
    storage_file : string, Optional, Default: 'checkpoint'
        Name of storage file we're reading from. Must match a valid key of
        ``self._storage_dict``.
    obey_checkpoint_interval : bool, Optional, Default: True
        If True, the iteration is first clipped to the last good iteration
        and then mapped to its checkpoint index; nothing is read if it is
        not on the checkpoint interval.
        If False, the read is attempted at index ``iteration`` regardless.

        WARNING: If the storage file you specify was written on the
        checkpoint interval and you set ``obey_checkpoint_interval=False``,
        you will get undefined behavior!

    Returns
    -------
    sampler_states : list of SamplerStates or None
        The previously stored sampler states for each replica.
        None is returned if the iteration is not on the checkpoint
        interval while ``obey_checkpoint_interval`` is True.

    """
    storage = self._storage_dict[storage_file]
    if obey_checkpoint_interval:
        iteration = self._calculate_last_iteration(iteration)
        read_iteration = self._calculate_checkpoint_iteration(iteration)
    else:
        read_iteration = iteration
    if read_iteration is not None:
        n_states = storage.dimensions['replica'].size

        sampler_states = list()
        for replica_index in range(n_states):
            # Restore positions (stored as f4, handed back as float64).
            x = storage.variables['positions'][read_iteration, replica_index, :, :].astype(np.float64)
            positions = unit.Quantity(x, unit.nanometers)

            # Restore box vectors.
            x = storage.variables['box_vectors'][read_iteration, replica_index, :, :].astype(np.float64)
            box_vectors = unit.Quantity(x, unit.nanometers)

            # Create SamplerState.
            sampler_states.append(mmtools.states.SamplerState(positions=positions, box_vectors=box_vectors))

        return sampler_states
    else:
        return None
# ============================================================================== # REPLICA-EXCHANGE SIMULATION # ==============================================================================
[docs]class ReplicaExchange(object): """Replica-exchange simulation facility. This base class provides a general replica-exchange simulation facility, allowing any set of thermodynamic states to be specified, along with a set of initial positions to be assigned to the replicas in a round-robin fashion. No distinction is made between one-dimensional and multidimensional replica layout. By default, the replica mixing scheme attempts to mix *all* replicas to minimize slow diffusion normally found in multidimensional replica exchange simulations (Modification of the 'replica_mixing_scheme' setting will allow the traditional 'neighbor swaps only' scheme to be used.) Stored configurations, energies, swaps, and restart information are all written to a single output file using the platform portable, robust, and efficient NetCDF4 library. Parameters ---------- mcmc_moves : MCMCMove or list of MCMCMove, optional The MCMCMove used to propagate the states. If a list of MCMCMoves, they will be assigned to the correspondent thermodynamic state on creation. If None is provided, Langevin dynamics with 2fm timestep, 5.0/ps collision rate, and 500 steps per iteration will be used. number_of_iterations : int or infinity, optional, default: 1 The number of iterations to perform. Both ``float('inf')`` and ``numpy.inf`` are accepted for infinity. If you set this to infinity, be sure to set also ``online_analysis_interval``. replica_mixing_scheme : 'swap-all', 'swap-neighbors' or None, Default: 'swap-all' The scheme used to swap thermodynamic states between replicas. online_analysis_interval : None or Int >= 1, optional, default None Choose the interval at which to perform online analysis of the free energy. After every interval, the simulation will be stopped and the free energy estimated. If the error in the free energy estimate is at or below ``online_analysis_target_error``, then the simulation will be considered completed. 
online_analysis_target_error : float >= 0, optional, default 0.2 The target error for the online analysis measured in kT per phase. Once the free energy is at or below this value, the phase will be considered complete. If ``online_analysis_interval`` is None, this option does nothing. online_analysis_minimum_iterations : int >= 0, optional, default 50 Set the minimum number of iterations which must pass before online analysis is carried out. Since the initial samples are likely not to yield a good estimate of free energy, save time and just skip them. If ``online_analysis_interval`` is None, this does nothing. Attributes ---------- n_replicas iteration mcmc_moves sampler_states metadata is_completed Examples -------- Parallel tempering simulation of alanine dipeptide in implicit solvent (replica exchange among temperatures). This is just an illustrative example; use :class:`ParallelTempering` class for actual production parallel tempering simulations. Create the system. >>> import math >>> from simtk import unit >>> from openmmtools import testsystems, states, mcmc >>> testsystem = testsystems.AlanineDipeptideImplicit() Create thermodynamic states for parallel tempering with exponentially-spaced schedule. >>> n_replicas = 3 # Number of temperature replicas. >>> T_min = 298.0 * unit.kelvin # Minimum temperature. >>> T_max = 600.0 * unit.kelvin # Maximum temperature. >>> temperatures = [T_min + (T_max - T_min) * (math.exp(float(i) / float(n_replicas-1)) - 1.0) / (math.e - 1.0) ... for i in range(n_replicas)] >>> thermodynamic_states = [states.ThermodynamicState(system=testsystem.system, temperature=T) ... for T in temperatures] Initialize simulation object with options. Run with a GHMC integrator. >>> move = mcmc.GHMCMove(timestep=2.0*unit.femtoseconds, n_steps=50) >>> simulation = ReplicaExchange(mcmc_moves=move, number_of_iterations=2) Create simulation with its storage file (in a temporary directory) and run. 
>>> storage_path = tempfile.NamedTemporaryFile(delete=False).name + '.nc' >>> reporter = Reporter(storage_path, checkpoint_interval=1) >>> simulation.create(thermodynamic_states=thermodynamic_states, >>> sampler_states=states.SamplerState(testsystem.positions), >>> storage=reporter) >>> simulation.run() # This runs for a maximum of 2 iterations. >>> simulation.iteration 2 >>> simulation.run(n_iterations=1) >>> simulation.iteration 2 To resume a simulation from an existing storage file and extend it beyond the original number of iterations. >>> del simulation >>> simulation = ReplicaExchange.from_storage(reporter) >>> simulation.extend(n_iterations=1) >>> simulation.iteration 3 You can extract several information from the NetCDF file using the Reporter class while the simulation is running. This reads the SamplerStates of every run iteration. >>> reporter = Reporter(storage=storage_path, open_mode='r', checkpoint_interval=1) >>> sampler_states = reporter.read_sampler_states(iteration=range(1, 4)) >>> len(sampler_states) 3 >>> sampler_states[-1].positions.shape # Alanine dipeptide has 22 atoms. (22, 3) Clean up. >>> os.remove(storage_path) :param number_of_iterations: Maximum number of integer iterations that will be run :param replica_mixing_scheme: Scheme which describes how replicas are exchanged each iteration as string :param online_analysis_interval: How frequently to carry out online analysis in number of iterations :param online_analysis_target_error: Target free energy difference error float at which simulation will be stopped during online analysis, in dimensionless energy :param online_analysis_minimum_iterations: Minimum number of iterations needed before online analysis is run as int """ # ------------------------------------------------------------------------- # Constructors. 
# -------------------------------------------------------------------------

def __init__(self, mcmc_moves=None, number_of_iterations=1, replica_mixing_scheme='swap-all',
             online_analysis_interval=None, online_analysis_target_error=0.2,
             online_analysis_minimum_iterations=50):
    """Set the simulation options; no state or storage is attached here.

    ``mcmc_moves`` is deep-copied (a Langevin dynamics move is built when
    it is None). The remaining arguments are assigned through the stored
    option properties, which validate them. Warnings are logged when
    ``number_of_iterations`` is infinite and no online-analysis stop
    criterion can be reached.
    """

    # These will be set on initialization. See function
    # create() for explanation of single variables.
    self._thermodynamic_states = None
    self._unsampled_states = None
    self._sampler_states = None
    self._replica_thermodynamic_states = None
    self._iteration = None
    self._energy_thermodynamic_states = None
    self._energy_unsampled_states = None
    self._n_accepted_matrix = None
    self._n_proposed_matrix = None
    self._reporter = None
    self._metadata = None

    # Handling default propagator.
    if mcmc_moves is None:
        # This will be converted to a list in create().
        self._mcmc_moves = mmtools.mcmc.LangevinDynamicsMove(timestep=2.0*unit.femtosecond,
                                                             collision_rate=5.0/unit.picosecond,
                                                             n_steps=500, reassign_velocities=True,
                                                             n_restart_attempts=6)
    else:
        # Copy so later changes to the caller's object do not leak in.
        self._mcmc_moves = copy.deepcopy(mcmc_moves)

    # Store constructor parameters. Everything is marked for internal
    # usage because any change to these attribute implies a change
    # in the storage file as well. Use properties for checks.
    self.number_of_iterations = number_of_iterations
    self.replica_mixing_scheme = replica_mixing_scheme

    # Online analysis options. The interval is assigned first because the
    # validators of the other two options read it.
    self.online_analysis_interval = online_analysis_interval
    self.online_analysis_target_error = online_analysis_target_error
    self.online_analysis_minimum_iterations = online_analysis_minimum_iterations
    self._online_error_trap_counter = 0  # Counter for errors in the online estimate
    self._online_error_bank = []
    self._last_mbar_f_k = None
    self._last_err_free_energy = None

    self._have_displayed_citations_before = False

    # Check convergence.
    if self.number_of_iterations == np.inf:
        if self.online_analysis_target_error == 0.0:
            logger.warning("WARNING! You have specified an unlimited number of iterations and a target error "
                           "for online analysis of 0.0! Your simulation may never reach 'completed' state!")
        elif self.online_analysis_interval is None:
            logger.warning("WARNING! This simulation will never be considered 'complete' since there is no "
                           "specified maximum number of iterations!")

@classmethod
def from_storage(cls, storage):
    """Constructor from an existing storage file.

    The storage is opened read-only to restore the simulation, closed,
    and then re-opened in append mode on MPI node 0 only.

    Parameters
    ----------
    storage : str or Reporter
        If str: The path to the storage file.
        If :class:`Reporter`: uses the :class:`Reporter` options
        In the future this will be able to take a Storage class as well.

    Returns
    -------
    repex : ReplicaExchange
        A new instance of ReplicaExchange in the same state of the
        last stored iteration.

    Raises
    ------
    RuntimeError
        If the storage file (or one of its subfiles) does not exist.

    """
    # Check if netcdf file exists, open data for read if it does
    if type(storage) is str:
        # Open a reporter to read the data.
        reporter = Reporter(storage, open_mode='r')
        file_name = storage
    else:
        reporter = storage
        reporter.open(mode='r')
        file_name = reporter.filepath
    if not reporter.storage_exists():
        reporter.close()
        raise RuntimeError('Storage file {} or its subfiles do not exist; cannot resume.'.format(file_name))

    # Retrieve options and create new simulation.
    options = reporter.read_dict('options')
    options['mcmc_moves'] = reporter.read_mcmc_moves()
    repex = cls(**options)

    # Display papers to be cited.
    repex._display_citations()

    # Read the last iteration reported to ensure we don't include junk
    # data written just before a crash.
    iteration = reporter.read_last_iteration()

    # Retrieve other attributes.
    logger.debug("Reading storage file {}...".format(file_name))
    thermodynamic_states, unsampled_states = reporter.read_thermodynamic_states()
    sampler_states = reporter.read_sampler_states(iteration=iteration)
    state_indices = reporter.read_replica_thermodynamic_states(iteration=iteration)
    energy_thermodynamic_states, energy_unsampled_states = reporter.read_energies(iteration=iteration)
    n_accepted_matrix, n_proposed_matrix = reporter.read_mixing_statistics(iteration=iteration)
    metadata = reporter.read_dict('metadata')

    # Search for last cached free energies only if online analysis is activated.
    if repex._online_analysis_interval is not None:
        online_anaysis_info = repex._get_last_written_free_energy(reporter, iteration)
        last_mbar_f_k, (_, last_err_free_energy) = online_anaysis_info
    else:
        last_mbar_f_k, (_, last_err_free_energy) = None, (None, None)

    # Close reading reporter.
    reporter.close()

    # Assign attributes. The private names are set directly, so the
    # property setters (and any storage writes they perform) are bypassed.
    repex._iteration = iteration
    repex._thermodynamic_states = thermodynamic_states
    repex._unsampled_states = unsampled_states
    repex._sampler_states = sampler_states
    repex._replica_thermodynamic_states = state_indices
    repex._energy_thermodynamic_states = energy_thermodynamic_states
    repex._energy_unsampled_states = energy_unsampled_states
    repex._n_accepted_matrix = n_accepted_matrix
    repex._n_proposed_matrix = n_proposed_matrix
    repex._metadata = metadata
    repex._last_mbar_f_k = last_mbar_f_k
    repex._last_err_free_energy = last_err_free_energy

    # We open the reporter only in node 0.
    repex._reporter = reporter
    mpi.run_single_node(0, repex._reporter.open, mode='a',
                        broadcast_result=False, sync_nodes=False)
    # Don't write the new last iteration, we have not technically written anything yet, so there is no "junk"
    return repex
# -------------------------------------------------------------------------
# Public properties.
# -------------------------------------------------------------------------

@property
def n_replicas(self):
    """The integer number of replicas (read-only).

    This is 0 until the thermodynamic states have been assigned.
    """
    if self._thermodynamic_states is None:
        return 0
    else:
        return len(self._thermodynamic_states)

@property
def iteration(self):
    """The integer current iteration of the simulation (read-only).

    If the simulation has not been created yet, this is None.

    """
    return self._iteration

@property
def mcmc_moves(self):
    """A copy of the MCMCMoves list used to propagate the simulation.

    This can be set only before creation.

    """
    return copy.deepcopy(self._mcmc_moves)

@mcmc_moves.setter
def mcmc_moves(self, new_value):
    if self._thermodynamic_states is not None:
        # We can't modify representation of the MCMCMoves because it's
        # impossible to delete groups/variables from an NetCDF file. We
        # could support this by JSONizing the dict serialization and
        # store it as a string instead, if we needed this.
        raise RuntimeError('Cannot modify MCMCMoves after creation.')
    # If this is a single MCMCMove, it'll be transformed to a list in create().
    self._mcmc_moves = copy.deepcopy(new_value)

@property
def sampler_states(self):
    """A copy of the sampler states list at the current iteration.

    This can be set only before running.

    """
    return copy.deepcopy(self._sampler_states)

@sampler_states.setter
def sampler_states(self, value):
    if self._iteration != 0:
        raise RuntimeError('Sampler states can be assigned only between '
                           'create() and run().')
    if len(value) != self.n_replicas:
        raise ValueError('Passed {} sampler states for {} replicas'.format(
            len(value), self.n_replicas))

    # Update sampler state in the object and on storage.
    self._sampler_states = copy.deepcopy(value)
    mpi.run_single_node(0, self._reporter.write_sampler_states,
                        self._sampler_states, self._iteration)

class _StoredProperty(object):
    """
    Descriptor of a property stored as an option.

    The value lives on the instance under ``'_' + option_name``.

    validate_function is a simple function for checking things like "X > 0", but exposes both the
        ReplicaExchange instance and the new value for the variable, in that order.
    More complex checks which relies on the ReplicaExchange instance, like "if Y == True, then check X" can be
        accessed through the instance object of the function

    """
    def __init__(self, option_name, validate_function=None):
        self._option_name = option_name
        self._validate_function = validate_function

    def __get__(self, instance, owner_class=None):
        # Class-level access (e.g. by introspection or documentation tools)
        # passes instance=None; hand back the descriptor itself instead of
        # failing with an AttributeError on None.
        if instance is None:
            return self
        return getattr(instance, '_' + self._option_name)

    def __set__(self, instance, new_value):
        if self._validate_function is not None:
            new_value = self._validate_function(instance, new_value)
        setattr(instance, '_' + self._option_name, new_value)
        # Update storage if the ReplicaExchange is initialized.
        if instance._thermodynamic_states is not None:
            mpi.run_single_node(0, instance._store_options)

    # ----------------------------------
    # Value Validation of the properties
    # Should be @staticmethod with arguments of (instance, value) in that order, even if instance is not used
    # ----------------------------------

    @staticmethod
    def _number_of_iterations_validator(instance, number_of_iterations):
        # Support infinite number of iterations.
        if not (0 <= number_of_iterations <= float('inf')):
            # The two literals used to be joined without a space
            # ("...arenon-negative...").
            raise ValueError('Accepted values for number_of_iterations are '
                             'non-negative integers and infinity.')
        return number_of_iterations

    @staticmethod
    def _repex_mixing_scheme_validator(instance, replica_mixing_scheme):
        supported_schemes = ['swap-all', 'swap-neighbors', None]
        if replica_mixing_scheme not in supported_schemes:
            raise ValueError("Unknown replica mixing scheme '{}'. Supported values "
                             "are {}.".format(replica_mixing_scheme, supported_schemes))
        return replica_mixing_scheme

    @staticmethod
    def _oa_interval_validator(instance, online_analysis_interval):
        """Check the online_analysis_interval value for consistency"""
        if online_analysis_interval is not None and (
                        type(online_analysis_interval) != int or online_analysis_interval < 1):
            raise ValueError("online_analysis_interval must be an integer 1 or greater, or None")
        return online_analysis_interval

    @staticmethod
    def _oa_target_error_validator(instance, online_analysis_target_error):
        # Only checked when online analysis is enabled on the instance.
        if instance.online_analysis_interval is not None:
            if online_analysis_target_error < 0:
                raise ValueError("online_analysis_target_error must be a float >= 0")
            elif online_analysis_target_error == 0:
                logger.warning("online_analysis_target_error of 0 may never converge.")
        return online_analysis_target_error

    @staticmethod
    def _oa_min_iter_validator(instance, online_analysis_minimum_iterations):
        # Only checked when online analysis is enabled on the instance.
        if (instance.online_analysis_interval is not None and
                (type(online_analysis_minimum_iterations) is not int or
                 online_analysis_minimum_iterations < 0)):
            raise ValueError("online_analysis_minimum_iterations must be an integer >= 0")
        return online_analysis_minimum_iterations

number_of_iterations = _StoredProperty('number_of_iterations',
                                       validate_function=_StoredProperty._number_of_iterations_validator)
replica_mixing_scheme = _StoredProperty('replica_mixing_scheme',
                                        validate_function=_StoredProperty._repex_mixing_scheme_validator)
online_analysis_interval = _StoredProperty('online_analysis_interval',
                                           validate_function=_StoredProperty._oa_interval_validator)  #:interval to carry out online analysis
online_analysis_target_error = _StoredProperty('online_analysis_target_error',
                                               validate_function=_StoredProperty._oa_target_error_validator)
online_analysis_minimum_iterations = _StoredProperty('online_analysis_minimum_iterations',
                                                     validate_function=_StoredProperty._oa_min_iter_validator)

@property
def metadata(self):
    """A copy of the metadata dictionary passed on creation (read-only)."""
    return copy.deepcopy(self._metadata)

@property
def is_completed(self):
    """Check if we have reached any of the stop target criteria (read-only)"""
    return self._is_completed()

# -------------------------------------------------------------------------
# Main public interface.
# -------------------------------------------------------------------------
def create(self, thermodynamic_states, sampler_states, storage,
           unsampled_thermodynamic_states=None, metadata=None):
    """Create new replica-exchange simulation.

    Parameters
    ----------
    thermodynamic_states : list of openmmtools.states.ThermodynamicState
        Thermodynamic states to simulate, where one replica is allocated per state.
        Each state must have a system with the same number of atoms.
    sampler_states : openmmtools.states.SamplerState or list
        One or more sets of initial sampler states. If a list of SamplerStates,
        they will be assigned to replicas in a round-robin fashion.
    storage : str or instanced Reporter
        If str: the path to the storage file. Default checkpoint options
        from Reporter class are used.
        If Reporter: uses the reporter options and storage path.
        In the future this will be able to take a Storage class as well.
    unsampled_thermodynamic_states : list of openmmtools.states.ThermodynamicState, optional
        These are ThermodynamicStates that are not propagated, but their
        reduced potential is computed at each iteration for each replica.
        These energy can be used as data for reweighting schemes (default
        is None).
    metadata : dict, optional
        Simulation metadata to be stored in the file. The dictionary is
        copied, so the object passed by the caller is never modified.

    Raises
    ------
    RuntimeError
        If the storage file already exists, or if the number of MCMCMoves
        differs from the number of thermodynamic states.
    ValueError
        If no thermodynamic or sampler states are given, if there are more
        sampler states than thermodynamic states, or if the states do not
        all have the same number of particles.

    """
    # Create temporary reporter.
    if isinstance(storage, str):
        reporter = Reporter(storage, open_mode=None)
        file_string = storage
    else:
        reporter = storage
        file_string = reporter.filepath

    # Check if netcdf files exist. This is run only on MPI node 0 and
    # broadcasted. This is to avoid the case where the other nodes
    # arrive to this line after node 0 has already created the storage
    # file, causing an error.
    if mpi.run_single_node(0, reporter.storage_exists, broadcast_result=True):
        raise RuntimeError("Storage file {} already exists; cowardly "
                           "refusing to overwrite.".format(file_string))

    # Make sure sampler_states is an iterable of SamplerStates for later.
    if isinstance(sampler_states, mmtools.states.SamplerState):
        sampler_states = [sampler_states]

    # Fail early with a clear message instead of an IndexError or a
    # ZeroDivisionError further down.
    if len(thermodynamic_states) == 0:
        raise ValueError('At least one ThermodynamicState is required.')
    if len(sampler_states) == 0:
        raise ValueError('At least one SamplerState is required.')

    # Make sure there are no more sampler states than thermodynamic states.
    if len(sampler_states) > len(thermodynamic_states):
        raise ValueError('Passed {} SamplerStates but only {} ThermodynamicStates'.format(
            len(sampler_states), len(thermodynamic_states)))

    # Make sure all states have same number of particles. We don't
    # currently support writing storage with different n_particles
    n_particles = thermodynamic_states[0].n_particles
    for states in [thermodynamic_states, sampler_states]:
        for state in states:
            if state.n_particles != n_particles:
                raise ValueError('All ThermodynamicStates and SamplerStates must '
                                 'have the same number of particles')

    # Handle default argument for metadata and add default simulation title.
    # Work on a copy so that the caller's dictionary is left untouched.
    default_title = ('Replica-exchange simulation created using ReplicaExchange class '
                     'of yank.repex.py on {}'.format(time.asctime(time.localtime())))
    if metadata is None:
        metadata = dict(title=default_title)
    else:
        metadata = copy.deepcopy(metadata)
        if 'title' not in metadata:
            metadata['title'] = default_title
    self._metadata = metadata

    # Save thermodynamic states. This sets n_replicas.
    self._thermodynamic_states = copy.deepcopy(thermodynamic_states)

    # Handle default unsampled thermodynamic states.
    if unsampled_thermodynamic_states is None:
        self._unsampled_states = []
    else:
        self._unsampled_states = copy.deepcopy(unsampled_thermodynamic_states)

    # Distribute sampler states to replicas in a round-robin fashion.
    self._sampler_states = [copy.deepcopy(sampler_states[i % len(sampler_states)])
                            for i in range(self.n_replicas)]

    # Map each thermodynamic state to a replica. Replica i at each iteration is
    # in ThermodynamicState thermodynamic_states[replica_thermodynamic_states[i]]
    # and SamplerState sampler_states[i]. During mixing, we exchange the indices
    # of the ThermodynamicState, but we keep the SamplerStates (i.e. positions,
    # velocities, box_vectors) bound to the same replica.
    self._replica_thermodynamic_states = np.arange(self.n_replicas, dtype=np.int64)

    # Assign default system box vectors if None has been specified.
    for replica_id, thermodynamic_state_id in enumerate(self._replica_thermodynamic_states):
        sampler_state = self._sampler_states[replica_id]
        if sampler_state.box_vectors is not None:
            continue
        thermodynamic_state = self._thermodynamic_states[thermodynamic_state_id]
        sampler_state.box_vectors = thermodynamic_state.system.getDefaultPeriodicBoxVectors()

    # Ensure there is an MCMCMove for each thermodynamic state.
    if isinstance(self._mcmc_moves, mmtools.mcmc.MCMCMove):
        self._mcmc_moves = [copy.deepcopy(self._mcmc_moves) for _ in range(self.n_replicas)]
    elif len(self._mcmc_moves) != self.n_replicas:
        raise RuntimeError('The number of MCMCMoves ({}) and ThermodynamicStates ({}) must '
                           'be the same.'.format(len(self._mcmc_moves), self.n_replicas))

    # Reset iteration counter.
    self._iteration = 0

    # Reset statistics.
    # _n_accepted_matrix[i][j] is the number of swaps accepted between thermodynamic states i and j.
    # _n_proposed_matrix[i][j] is the number of swaps proposed between thermodynamic states i and j.
    self._n_accepted_matrix = np.zeros([self.n_replicas, self.n_replicas], np.int64)
    self._n_proposed_matrix = np.zeros([self.n_replicas, self.n_replicas], np.int64)

    # Allocate memory for energy matrix. energy_thermodynamic/unsampled_states[k][l]
    # is the reduced potential computed at the positions of SamplerState sampler_states[k]
    # and ThermodynamicState thermodynamic/unsampled_states[l].
    self._energy_thermodynamic_states = np.zeros([self.n_replicas, self.n_replicas], np.float64)
    self._energy_unsampled_states = np.zeros([self.n_replicas, len(self._unsampled_states)], np.float64)

    # Display papers to be cited.
    self._display_citations()

    # Close the reporter file so its ready for use
    reporter.close()
    self._reporter = reporter
    self._initialize_reporter()
@mmtools.utils.with_timer('Minimizing all replicas')
def minimize(self, tolerance=1.0*unit.kilojoules_per_mole/unit.nanometers, max_iterations=0):
    """Energy-minimize every replica and store the resulting positions.

    Parameters
    ----------
    tolerance : simtk.unit.Quantity, optional
        Minimization tolerance (units of energy/mole/length, default is
        ``1.0 * unit.kilojoules_per_mole / unit.nanometers``).
    max_iterations : int, optional
        Maximum number of iterations for minimization. If 0, minimization
        continues until converged.

    Raises
    ------
    RuntimeError
        If the simulation has no replicas (i.e. it has not been created).

    """
    # A simulation that was never created has no replicas to minimize.
    if self.n_replicas == 0:
        raise RuntimeError('Cannot minimize replicas. The simulation must be created first.')

    logger.debug("Minimizing all replicas...")

    # Spread the minimization over the MPI nodes. Node 0 collects every
    # result; the other nodes only keep the replicas they worked on, which
    # is all they need for propagation and for their energy matrix entries.
    all_replica_ids = range(self.n_replicas)
    distributed_results = mpi.distribute(self._minimize_replica, all_replica_ids,
                                         tolerance, max_iterations, send_results_to=0)
    new_positions, updated_replica_ids = distributed_results

    # Store the minimized positions in the matching sampler states. On
    # nodes other than 0 this touches only the locally minimized replicas.
    for replica_id, positions in zip(updated_replica_ids, new_positions):
        self._sampler_states[replica_id].positions = positions

    # Node 0 writes the minimized configurations to the storage.
    mpi.run_single_node(0, self._reporter.write_sampler_states,
                        self._sampler_states, self._iteration)
def equilibrate(self, n_iterations, mcmc_moves=None):
    """Equilibrate all replicas.

    This does not increase the iteration counter. The equilibrated
    positions are stored at the end.

    Parameters
    ----------
    n_iterations : int
        Number of equilibration iterations.
    mcmc_moves : MCMCMove or list of MCMCMove, optional
        Optionally, the MCMCMoves to use for equilibration can be
        different from the ones used in production.

    Raises
    ------
    RuntimeError
        If the simulation has not been created yet, or if the number of
        equilibration MCMCMoves differs from the number of replicas.

    """
    # Check that simulation has been created.
    if self.n_replicas == 0:
        raise RuntimeError('Cannot equilibrate replicas. The simulation must be created first.')

    # If no MCMCMove is specified, use the ones for production.
    if mcmc_moves is None:
        mcmc_moves = self._mcmc_moves

    # Make sure there is one MCMCMove per thermodynamic state.
    if isinstance(mcmc_moves, mmtools.mcmc.MCMCMove):
        mcmc_moves = [copy.deepcopy(mcmc_moves) for _ in range(self.n_replicas)]
    elif len(mcmc_moves) != self.n_replicas:
        # Report the length of the moves that were actually checked.
        raise RuntimeError('The number of MCMCMoves ({}) and ThermodynamicStates ({}) for equilibration'
                           ' must be the same.'.format(len(mcmc_moves), self.n_replicas))

    # Temporarily set the equilibration MCMCMoves. The production moves are
    # restored in the finally clause so that a failure during propagation
    # cannot leave the equilibration moves installed.
    production_mcmc_moves = self._mcmc_moves
    self._mcmc_moves = mcmc_moves
    try:
        for iteration in range(n_iterations):
            logger.debug("Equilibration iteration {}/{}".format(iteration, n_iterations))
            self._propagate_replicas()
    finally:
        # Restore production MCMCMoves.
        self._mcmc_moves = production_mcmc_moves

    # Update stored positions.
    mpi.run_single_node(0, self._reporter.write_sampler_states, self._sampler_states, self._iteration)
def run(self, n_iterations=None):
    """Run the replica-exchange simulation.

    This runs at most ``number_of_iterations`` iterations. Use :func:`extend`
    to pass the limit.

    Parameters
    ----------
    n_iterations : int, optional
        If specified, only at most the specified number of iterations
        will be run (default is None).

    Raises
    ------
    utils.SimulationNaNError
        If the initial coordinates are NaN, or if a NaN energy is found
        (raised by ``_check_nan_energy``).

    """
    # If this is the first iteration, compute and store the
    # starting energies of the minimized/equilibrated structures.
    if self._iteration == 0:
        try:
            self._compute_energies()
        # We're intercepting a possible initial NaN position here thrown by OpenMM, which is a simple exception
        # So we have to under-specify this trap.
        except Exception as e:
            if 'coordinate is nan' in str(e).lower():
                err_message = "Initial coordinates were NaN! Check your inputs!"
                logger.critical(err_message)
                raise utils.SimulationNaNError(err_message)
            # If not the special case, re-raise with the original traceback.
            raise
        mpi.run_single_node(0, self._reporter.write_energies, self._energy_thermodynamic_states,
                            self._energy_unsampled_states, self._iteration)
        self._check_nan_energy()

    timer = mmtools.utils.Timer()
    timer.start('Run ReplicaExchange')
    run_initial_iteration = self._iteration

    # Handle default argument and determine the iteration at which to stop.
    # The limit is an absolute iteration index because _is_completed()
    # compares it against self._iteration; it must not be the number of
    # iterations left, or a resumed simulation would stop too early.
    if n_iterations is None:
        iteration_limit = self._number_of_iterations
    else:
        iteration_limit = min(self._iteration + n_iterations, self._number_of_iterations)

    # Main loop.
    while not self._is_completed(iteration_limit):
        # Increment iteration counter.
        self._iteration += 1

        logger.debug('Iteration {}/{}'.format(self._iteration, iteration_limit))
        timer.start('Iteration')

        # Attempt replica swaps to sample from equilibrium permutation of
        # states associated with replicas. This step synchronizes replicas.
        self._replica_thermodynamic_states = self._mix_replicas()

        # Propagate replicas.
        self._propagate_replicas()

        # Compute energies of all replicas at all states.
        self._compute_energies()

        # Write iteration to storage file.
        self._report_iteration()

        # Compute online free energy
        if (self._online_analysis_interval is not None and
                self._iteration > self._online_analysis_minimum_iterations and
                self._iteration % self._online_analysis_interval == 0):
            # Executed only by node 0 and broadcasted to be used in _is_completed().
            self._last_err_free_energy = self._run_online_analysis()

        # Show timing statistics if debug level is activated.
        if logger.isEnabledFor(logging.DEBUG):
            iteration_time = timer.stop('Iteration')
            partial_total_time = timer.partial('Run ReplicaExchange')
            # At least one iteration has been run here, so the divisor is >= 1.
            time_per_iteration = partial_total_time / (self._iteration - run_initial_iteration)
            estimated_time_remaining = time_per_iteration * (iteration_limit - self._iteration)
            estimated_total_time = time_per_iteration * iteration_limit
            estimated_finish_time = time.time() + estimated_time_remaining
            logger.debug("Iteration took {:.3f}s.".format(iteration_time))
            # An unbounded iteration limit has no meaningful completion time.
            if estimated_time_remaining != float('inf'):
                logger.debug("Estimated completion in {}, at {} (consuming total wall clock time {}).".format(
                    str(datetime.timedelta(seconds=estimated_time_remaining)),
                    time.ctime(estimated_finish_time),
                    str(datetime.timedelta(seconds=estimated_total_time))))

        # Perform sanity checks to see if we should terminate here.
        self._check_nan_energy()
def extend(self, n_iterations):
    """Run ``n_iterations`` more iterations, raising the iteration limit if needed.

    Unlike :func:`run`, this goes past ``number_of_iterations`` when the
    requested iterations do not fit within the current limit.

    Parameters
    ----------
    n_iterations : int
        The number of iterations to run.

    """
    final_iteration = self._iteration + n_iterations
    if final_iteration > self._number_of_iterations:
        # This MUST be assigned to a property or the storage won't be updated.
        self.number_of_iterations = final_iteration
    self.run(n_iterations)
def __repr__(self):
    """Return a 'formal' representation that can be used to reconstruct the class, if possible."""
    # TODO: Can we make this a more useful expression?
    return "<instance of ReplicaExchange>"

# Disabled until we have a better answer to this
# def __str__(self):
#     """
#     Show an 'informal' human-readable representation of the replica-exchange simulation.
#     """
#     r = "Replica-exchange simulation\n"
#     r += "\n"
#     r += "{:d} replicas\n".format(self.nreplicas)
#     r += "{:d} coordinate sets provided\n".format(len(self.provided_positions))
#     r += "file store: {:s}\n".format(self.store_filename)
#     r += "initialized: {:s}\n".format(self._initialized)
#     r += "\n"
#     r += "PARAMETERS\n"
#     r += "collision rate: {:s}\n".format(self.collision_rate)
#     r += "relative constraint tolerance: {:s}\n".format(self.constraint_tolerance)
#     r += "timestep: {:s}\n".format(self.timestep)
#     r += "number of steps/iteration: {:d}\n".format(self.nsteps_per_iteration)
#     r += "number of iterations: {:d}\n".format(self._number_of_iterations)
#     if self.extend_simulation:
#         r += "Iterations extending existing data.\n"
#     r += "equilibration timestep: {:s}\n".format(self.equilibration_timestep)
#     r += "number of equilibration iterations: {:d}\n".format(self.number_of_equilibration_iterations)
#     r += "\n"
#
#     return r

def __del__(self):
    """Close the reporter (on MPI node 0) when the object is destroyed."""
    # The reporter could be None if ReplicaExchange was not created. getattr()
    # also covers an object whose _reporter attribute was never assigned, so
    # that finalization does not raise an AttributeError.
    reporter = getattr(self, '_reporter', None)
    if reporter is not None:
        mpi.run_single_node(0, reporter.close)

# -------------------------------------------------------------------------
# Internal-usage.
# -------------------------------------------------------------------------

def _check_nan_energy(self):
    """Checks that energies are finite and abort otherwise.

    Checks both sampled and unsampled thermodynamic states.

    Raises
    ------
    utils.SimulationNaNError
        If any entry of the sampled or unsampled energy matrix is NaN.

    """
    sampled_energies = self._energy_thermodynamic_states
    unsampled_energies = self._energy_unsampled_states
    if not (np.any(np.isnan(sampled_energies)) or np.any(np.isnan(unsampled_energies))):
        return

    # Find faulty replicas to create error message.
    # Check sampled thermodynamic states first.
    state_type = 'thermodynamic state'
    faulty_replicas = set(replica_id for replica_id in range(self.n_replicas)
                          if np.any(np.isnan(sampled_energies[replica_id])))

    # If there are no NaNs in energies, the problem is in the unsampled states.
    # The energy matrix must be inspected here, not the list of state objects.
    if len(faulty_replicas) == 0:
        state_type = 'unsampled thermodynamic state'
        faulty_replicas = set(replica_id for replica_id in range(self.n_replicas)
                              if np.any(np.isnan(unsampled_energies[replica_id])))

    # Raise exception.
    err_msg = "NaN encountered in {} energies for replicas {}".format(state_type, faulty_replicas)
    logger.critical(err_msg)
    raise utils.SimulationNaNError(err_msg)

@mpi.on_single_node(rank=0, broadcast_result=False, sync_nodes=False)
def _display_citations(self, overwrite_global=False):
    """Display papers to be cited.

    Parameters
    ----------
    overwrite_global : bool, optional
        If True, the citations are displayed even if they have been shown
        before or the global citation silence flag is set.

    """
    # TODO Add original citations for various replica-exchange schemes.
    # TODO Show subset of OpenMM citations based on what features are being used.
    openmm_citations = """\
    Friedrichs MS, Eastman P, Vaidyanathan V, Houston M, LeGrand S, Beberg AL, Ensign DL, Bruns CM, and Pande VS. Accelerating molecular dynamic simulations on graphics processing unit. J. Comput. Chem. 30:864, 2009. DOI: 10.1002/jcc.21209
    Eastman P and Pande VS. OpenMM: A hardware-independent framework for molecular simulations. Comput. Sci. Eng. 12:34, 2010. DOI: 10.1109/MCSE.2010.27
    Eastman P and Pande VS. Efficient nonbonded interactions for molecular dynamics on a graphics processing unit. J. Comput. Chem. 31:1268, 2010. DOI: 10.1002/jcc.21413
    Eastman P and Pande VS. Constant constraint matrix approximation: A robust, parallelizable constraint method for molecular simulations. J. Chem. Theor. Comput. 6:434, 2010. DOI: 10.1021/ct900463w"""

    gibbs_citations = """\
    Chodera JD and Shirts MR. Replica exchange and expanded ensemble simulations as Gibbs sampling: Simple improvements for enhanced mixing. J. Chem. Phys., 135:194110, 2011. DOI:10.1063/1.3660669"""

    # NOTE(review): this citation is defined but never printed in this method.
    mbar_citations = """\
    Shirts MR and Chodera JD. Statistically optimal analysis of samples from multiple equilibrium states. J. Chem. Phys. 129:124105, 2008. DOI: 10.1063/1.2978177"""

    if overwrite_global or (not self._have_displayed_citations_before and not self._global_citation_silence):
        print("Please cite the following:")
        print("")
        print(openmm_citations)
        if self._replica_mixing_scheme == 'swap-all':
            print(gibbs_citations)
        self._have_displayed_citations_before = True

# -------------------------------------------------------------------------
# Internal-usage: Initialization and storage utilities.
# -------------------------------------------------------------------------

@staticmethod
def _does_file_exist(file_path):
    """Check if there is a non-empty file at the given path."""
    return os.path.exists(file_path) and os.path.getsize(file_path) > 0

@mpi.on_single_node(rank=0, broadcast_result=False, sync_nodes=True)
def _initialize_reporter(self):
    """Initialize the reporter and store initial information.

    This is executed only on MPI node 0 and it is blocking. This is to avoid
    the case where the other nodes skip ahead and try to read from a file
    that hasn't been created yet.

    """
    self._reporter.open(mode='w')
    self._reporter.write_thermodynamic_states(self._thermodynamic_states,
                                              self._unsampled_states)

    # Store run metadata and ReplicaExchange options.
    self._store_options()
    self._reporter.write_dict('metadata', self._metadata)

    # Store initial conditions. This forces the storage to be synchronized.
    self._report_iteration()

@mpi.on_single_node(rank=0, broadcast_result=False, sync_nodes=False)
@mpi.delayed_termination
@mmtools.utils.with_timer('Writing iteration information to storage')
def _report_iteration(self):
    """Store positions, states, and energies of current iteration.

    This is executed only on MPI node 0 and it's not blocking. The
    termination is delayed so that the file is not written only with
    partial data if the program gets interrupted.

    """
    self._reporter.write_sampler_states(self._sampler_states, self._iteration)
    self._reporter.write_replica_thermodynamic_states(self._replica_thermodynamic_states, self._iteration)
    self._reporter.write_mcmc_moves(self._mcmc_moves)  # MCMCMoves can store internal statistics.
    self._reporter.write_energies(self._energy_thermodynamic_states, self._energy_unsampled_states,
                                  self._iteration)
    self._reporter.write_mixing_statistics(self._n_accepted_matrix, self._n_proposed_matrix, self._iteration)
    self._reporter.write_timestamp(self._iteration)
    self._reporter.write_last_iteration(self._iteration)
    self._reporter.sync()

def _store_options(self):
    """Store __init__ parameters (beside MCMCMoves) in storage file.

    Every ``__init__`` parameter that has a default value is read back from
    the attribute of the same name prefixed by an underscore.

    """
    logger.debug("Storing general ReplicaExchange options...")

    # Inspect __init__ parameters to store. inspect.getargspec() no longer
    # exists in Python 3.11+, so prefer getfullargspec() when it is available.
    getargspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    argspec = getargspec(self.__init__)
    parameter_names, defaults = argspec.args, argspec.defaults

    # Retrieve and store options.
    options_to_store = {parameter_name: getattr(self, '_' + parameter_name)
                        for parameter_name in parameter_names[-len(defaults):]}

    # We store the MCMCMoves separately.
    options_to_store.pop('mcmc_moves')

    self._reporter.write_dict('options', options_to_store)

# -------------------------------------------------------------------------
# Internal-usage: Distributed tasks.
# -------------------------------------------------------------------------

@mmtools.utils.with_timer('Propagating all replicas')
def _propagate_replicas(self):
    """Propagate all replicas."""
    # TODO Report on efficiency of dynamics (fraction of time wasted to overhead).
    logger.debug("Propagating all replicas...")

    # Distribute propagation across nodes. Only node 0 will get all positions
    # and box vectors. The other nodes, only need the positions that they use
    # for propagation and computation of the energy matrix entries.
    propagated_states, replica_ids = mpi.distribute(self._propagate_replica, range(self.n_replicas),
                                                    send_results_to=0)

    # Update all sampler states. For non-0 nodes, this will update only the
    # sampler states associated to the replicas propagated by this node.
    for replica_id, propagated_state in zip(replica_ids, propagated_states):
        propagated_positions, propagated_box_vectors = propagated_state  # Unpack.
        self._sampler_states[replica_id].positions = propagated_positions
        self._sampler_states[replica_id].box_vectors = propagated_box_vectors

    # Gather all MCMCMoves statistics. All nodes must have these up-to-date
    # since they are tied to the ThermodynamicState, not the replica.
    all_statistics = mpi.distribute(self._get_replica_move_statistics, range(self.n_replicas),
                                    send_results_to='all')
    for replica_id in range(self.n_replicas):
        if len(all_statistics[replica_id]) > 0:
            thermodynamic_state_id = self._replica_thermodynamic_states[replica_id]
            self._mcmc_moves[thermodynamic_state_id].statistics = all_statistics[replica_id]

def _propagate_replica(self, replica_id):
    """Propagate thermodynamic state associated to the given replica.

    Returns
    -------
    positions, box_vectors
        The positions and box vectors of the replica's sampler state after
        the MCMC move has been applied.

    Raises
    ------
    utils.SimulationNaNError
        If the MCMC move raises an ``IntegratorMoveError``. The error is
        serialized to a 'nan-error-logs' directory next to the storage file.

    """
    # Retrieve thermodynamic, sampler states, and MCMC move of this replica.
    thermodynamic_state_id = self._replica_thermodynamic_states[replica_id]
    thermodynamic_state = self._thermodynamic_states[thermodynamic_state_id]
    mcmc_move = self._mcmc_moves[thermodynamic_state_id]
    sampler_state = self._sampler_states[replica_id]

    # Apply MCMC move.
    try:
        mcmc_move.apply(thermodynamic_state, sampler_state)
    except mmtools.mcmc.IntegratorMoveError as e:
        # Save NaNnig context and MCMove before aborting.
        output_dir = os.path.join(os.path.dirname(self._reporter.filepath), 'nan-error-logs')
        file_name = 'iteration{}-replica{}-state{}'.format(self._iteration, replica_id,
                                                           thermodynamic_state_id)
        e.serialize_error(os.path.join(output_dir, file_name))
        message = ('This Repex simulation threw a NaN!\nLook for error logs in:\n'
                   '\tDirectory: {}\tFile Name Base: {}').format(output_dir, file_name)
        logger.critical(message)
        raise utils.SimulationNaNError(message)

    # Return new positions and box vectors.
    return sampler_state.positions, sampler_state.box_vectors

def _get_replica_move_statistics(self, replica_id):
    """Return the statistics of the MCMCMove currently associated to this replica.

    An empty dictionary is returned if the move has no ``statistics`` attribute.

    """
    thermodynamic_state_id = self._replica_thermodynamic_states[replica_id]
    mcmc_move = self._mcmc_moves[thermodynamic_state_id]

    try:
        move_statistics = mcmc_move.statistics
    except AttributeError:
        move_statistics = {}

    return move_statistics

def _minimize_replica(self, replica_id, tolerance, max_iterations):
    """Minimize the specified replica and return its minimized positions."""
    # Retrieve thermodynamic and sampler states.
    thermodynamic_state_id = self._replica_thermodynamic_states[replica_id]
    thermodynamic_state = self._thermodynamic_states[thermodynamic_state_id]
    sampler_state = self._sampler_states[replica_id]

    # Retrieve a context. Any Integrator works.
    context, integrator = mmtools.cache.global_context_cache.get_context(thermodynamic_state)

    # Set initial positions and box vectors.
    sampler_state.apply_to_context(context)

    # Compute the initial energy of the system for logging.
    initial_energy = thermodynamic_state.reduced_potential(context)
    logger.debug('Replica {}/{}: initial energy {:8.3f}kT'.format(
        replica_id + 1, self.n_replicas, initial_energy))

    # Minimize energy.
    openmm.LocalEnergyMinimizer.minimize(context, tolerance, max_iterations)

    # Get the minimized positions.
    sampler_state.update_from_context(context)

    # Compute the final energy of the system for logging.
    final_energy = thermodynamic_state.reduced_potential(sampler_state)
    logger.debug('Replica {}/{}: final energy {:8.3f}kT'.format(
        replica_id + 1, self.n_replicas, final_energy))

    # Return minimized positions.
    return sampler_state.positions

@mmtools.utils.with_timer('Computing energy matrix')
def _compute_energies(self):
    """Compute energies of all replicas at all states."""
    # Distribute energy computation across nodes. Only node 0 receives
    # all the energies since it needs to store them and mix states.
    new_energies, replica_ids = mpi.distribute(self._compute_replica_energies, range(self.n_replicas),
                                               send_results_to=0)

    # Update energy matrices. Non-0 nodes update only the energies computed by this replica.
    for replica_id, energies in zip(replica_ids, new_energies):
        energy_thermodynamic_states, energy_unsampled_states = energies  # Unpack.
        self._energy_thermodynamic_states[replica_id] = energy_thermodynamic_states
        self._energy_unsampled_states[replica_id] = energy_unsampled_states

def _compute_replica_energies(self, replica_id):
    """Compute the energy for the replica in every ThermodynamicState.

    Returns
    -------
    energy_thermodynamic_states, energy_unsampled_states : numpy.ndarray
        The reduced potentials of this replica's sampler state evaluated at
        each sampled and each unsampled thermodynamic state, respectively.

    """
    # Initialize replica energies for each thermodynamic state.
    energy_thermodynamic_states = np.zeros(self.n_replicas)
    energy_unsampled_states = np.zeros(len(self._unsampled_states))

    # Retrieve sampler state associated to this replica.
    sampler_state = self._sampler_states[replica_id]

    # Compute energy for all thermodynamic states.
    for energies, states in [(energy_thermodynamic_states, self._thermodynamic_states),
                             (energy_unsampled_states, self._unsampled_states)]:
        for i, state in enumerate(states):
            # Check if we need to request a new Context. The i == 0 test
            # short-circuits, so context_state is always bound when read.
            if i == 0 or not state.is_state_compatible(context_state):
                context_state = state

                # Get the context, any Integrator works.
                context, integrator = mmtools.cache.global_context_cache.get_context(state)

                # Update positions and box vectors. We don't need
                # to set Context velocities for the potential.
                sampler_state.apply_to_context(context, ignore_velocities=True)
            else:
                # If this state is compatible with the context, just fix the
                # thermodynamic state as positions/box vectors are the same.
                state.apply_to_context(context)

            # Compute energy.
            energies[i] = state.reduced_potential(context)

    # Return the new energies.
    return energy_thermodynamic_states, energy_unsampled_states

# -------------------------------------------------------------------------
# Internal-usage: Replicas mixing.
# -------------------------------------------------------------------------

@mpi.on_single_node(0, broadcast_result=True)
def _mix_replicas(self):
    """Attempt to swap replicas according to user-specified scheme.

    Returns
    -------
    replica_thermodynamic_states
        The updated replica -> thermodynamic state index map, returned so
        that it can be broadcast to the other MPI nodes.

    """
    logger.debug("Mixing replicas...")

    # Reset storage to keep track of swap attempts this iteration.
    self._n_accepted_matrix[:, :] = 0
    self._n_proposed_matrix[:, :] = 0

    # Perform swap attempts according to requested scheme.
    with mmtools.utils.time_it('Mixing of replicas'):
        if self.replica_mixing_scheme == 'swap-neighbors':
            self._mix_neighboring_replicas()
        elif self.replica_mixing_scheme == 'swap-all':
            # Try to use cython-accelerated mixing code if possible,
            # otherwise fall back to the pure Python code. An ImportError
            # means the compiled extension is not available.
            try:
                self._mix_all_replicas_cython()
            except (ValueError, ImportError) as e:
                # Exceptions have no 'message' attribute in Python 3.
                logger.warning(str(e))
                self._mix_all_replicas()
        else:
            assert self.replica_mixing_scheme is None

    # Determine fraction of swaps accepted this iteration.
    n_swaps_proposed = self._n_proposed_matrix.sum()
    n_swaps_accepted = self._n_accepted_matrix.sum()
    swap_fraction_accepted = 0.0
    if n_swaps_proposed > 0:
        # TODO drop casting to float when dropping Python 2 support.
        swap_fraction_accepted = float(n_swaps_accepted) / n_swaps_proposed
    logger.debug("Accepted {}/{} attempted swaps ({:.1f}%)".format(n_swaps_accepted, n_swaps_proposed,
                                                                   swap_fraction_accepted * 100.0))

    # Return new states indices for MPI broadcasting.
    return self._replica_thermodynamic_states

def _mix_all_replicas_cython(self):
    """Exchange all replicas with Cython-accelerated code."""
    from .mixing._mix_replicas import _mix_replicas_cython

    replica_states = md.utils.ensure_type(self._replica_thermodynamic_states, np.int64, 1, "Replica States")
    u_kl = md.utils.ensure_type(self._energy_thermodynamic_states, np.float64, 2, "Reduced Potentials")
    n_proposed_matrix = md.utils.ensure_type(self._n_proposed_matrix, np.int64, 2, "Nij Proposed Swaps")
    n_accepted_matrix = md.utils.ensure_type(self._n_accepted_matrix, np.int64, 2, "Nij Accepted Swaps")
    _mix_replicas_cython(self.n_replicas**4, self.n_replicas, replica_states,
                         u_kl, n_proposed_matrix, n_accepted_matrix)

    self._replica_thermodynamic_states = replica_states
    self._n_proposed_matrix = n_proposed_matrix
    self._n_accepted_matrix = n_accepted_matrix

def _mix_all_replicas(self):
    """Exchange all replicas with Python."""
    # Determine number of swaps to attempt to ensure thorough mixing.
    # TODO: Replace this with analytical result computed to guarantee sufficient mixing, or
    # TODO:    adjust it based on how many we can afford to do and not have mixing take a
    # TODO:    substantial fraction of iteration time.
    # n_replicas**5 attempts would be ideal, but it is too slow;
    # n_replicas**3 is the best compromise for pure Python.
    nswap_attempts = self.n_replicas**3

    logger.debug("Will attempt to swap all pairs of replicas, using a total of %d attempts." % nswap_attempts)

    # Attempt swaps to mix replicas.
    for swap_attempt in range(nswap_attempts):
        # Choose random replicas uniformly to attempt to swap.
        replica_i = np.random.randint(self.n_replicas)
        replica_j = np.random.randint(self.n_replicas)
        self._attempt_swap(replica_i, replica_j)

def _mix_neighboring_replicas(self):
    """Attempt exchanges between neighboring replicas only."""
    logger.debug("Will attempt to swap only neighboring replicas.")

    # Attempt swaps of pairs of replicas using traditional scheme (e.g. [0,1], [2,3], ...).
    offset = np.random.randint(2)  # Offset is 0 or 1.
    for thermodynamic_state_i in range(offset, self.n_replicas-1, 2):
        thermodynamic_state_j = thermodynamic_state_i + 1  # Neighboring state.

        # Determine which replicas currently hold the thermodynamic states.
        # np.where() returns a tuple of index arrays; take the single match
        # so that _attempt_swap() receives plain integer replica indices.
        replica_i = np.where(self._replica_thermodynamic_states == thermodynamic_state_i)[0][0]
        replica_j = np.where(self._replica_thermodynamic_states == thermodynamic_state_j)[0][0]
        self._attempt_swap(replica_i, replica_j)

def _attempt_swap(self, replica_i, replica_j):
    """Attempt a single exchange between two replicas.

    The proposed/accepted swap counters are updated, and on acceptance the
    thermodynamic state indices of the two replicas are exchanged.

    Parameters
    ----------
    replica_i, replica_j : int
        Indices of the two replicas whose thermodynamic states may be swapped.

    """
    # Determine the thermodynamic states associated to these replicas.
    thermodynamic_state_i = self._replica_thermodynamic_states[replica_i]
    thermodynamic_state_j = self._replica_thermodynamic_states[replica_j]

    # Compute log probability of swap.
    energy_ij = self._energy_thermodynamic_states[replica_i, thermodynamic_state_j]
    energy_ji = self._energy_thermodynamic_states[replica_j, thermodynamic_state_i]
    energy_ii = self._energy_thermodynamic_states[replica_i, thermodynamic_state_i]
    energy_jj = self._energy_thermodynamic_states[replica_j, thermodynamic_state_j]
    log_p_accept = - (energy_ij + energy_ji) + energy_ii + energy_jj

    # Record that this move has been proposed.
    self._n_proposed_matrix[thermodynamic_state_i, thermodynamic_state_j] += 1
    self._n_proposed_matrix[thermodynamic_state_j, thermodynamic_state_i] += 1

    # Accept or reject. The first test also keeps math.exp() away from
    # positive arguments.
    if log_p_accept >= 0.0 or np.random.rand() < math.exp(log_p_accept):
        # Swap states in replica slots i and j.
        self._replica_thermodynamic_states[replica_i] = thermodynamic_state_j
        self._replica_thermodynamic_states[replica_j] = thermodynamic_state_i
        # Accumulate statistics.
        self._n_accepted_matrix[thermodynamic_state_i, thermodynamic_state_j] += 1
        self._n_accepted_matrix[thermodynamic_state_j, thermodynamic_state_i] += 1

# -------------------------------------------------------------------------
# Internal-usage: Online Analysis.
# ------------------------------------------------------------------------- @mpi.on_single_node(rank=0, broadcast_result=True) @mpi.delayed_termination @mmtools.utils.with_timer('Computing online free energy estimate') def _run_online_analysis(self): """Compute the free energy during simulation run""" # This relative import is down here because having it at the top causes an ImportError. # __init__ pulls in repex, which pulls in analyze, which pulls in repex. Because the first repex never finished # importing, its not in the name space which causes relative analyze import of repex to crash as neither of them # are the __main__ package. # https://stackoverflow.com/questions/6351805/cyclic-module-dependencies-and-relative-imports-in-python from .analyze import ReplicaExchangeAnalyzer # Start the analysis bump_error_counter = False analysis = ReplicaExchangeAnalyzer(self._reporter, analysis_kwargs={'initial_f_k': self._last_mbar_f_k}) # Indices for online analysis, "i'th index, j'th index" idx, jdx = 0, -1 timer = mmtools.utils.Timer() timer.start("MBAR") logger.debug("Computing free energy with MBAR...") try: # Trap errors for MBAR being under sampled and the W_nk matrix not being normalized correctly mbar = analysis.mbar free_energy, err_free_energy = analysis.get_free_energy() except ParameterError as e: # We don't update self._last_err_free_energy here since if it # wasn't below the target threshold before, it won't stop repex now. bump_error_counter = True self._online_error_bank.append(e) else: self._last_mbar_f_k = mbar.f_k free_energy = free_energy[idx, jdx] self._last_err_free_energy = err_free_energy[idx, jdx] logger.debug("Current Free Energy Estimate is {} +- {} kT".format(free_energy, self._last_err_free_energy)) # Trap a case when errors don't converge (usually due to under sampling) if np.isnan(self._last_err_free_energy): self._last_err_free_energy = np.inf timer.stop("MBAR") # Raise an exception after 6 times MBAR gave an error. 
if bump_error_counter: self._online_error_trap_counter += 1 if self._online_error_trap_counter >= 6: logger.debug("Thrown MBAR Errors:") for err in self._online_error_bank: logger.debug(str(err)) raise RuntimeError("Online Analysis has failed too many times! Please " "check the latest logs to see the thrown errors!") # Don't write out the free energy in case of error. return # Write out the numbers self._reporter.write_mbar_free_energies(self._iteration, self._last_mbar_f_k, (free_energy, self._last_err_free_energy)) # Broadcast the last free energy used to determine completion. return self._last_err_free_energy @staticmethod def _get_last_written_free_energy(reporter, iteration): """Get the last free energy computed from online analysis""" last_f_k = None last_free_energy = None try: for index in range(iteration, 0, -1): last_f_k, last_free_energy = reporter.read_mbar_free_energies(index) # Find an f_k that is not all zeros (or masked and empty) if not (np.ma.is_masked(last_f_k) or np.all(last_f_k == 0)): break # Don't need to continue the loop if we already found one except (IndexError, KeyError): # No such f_k written yet (or variable created), no need to do anything pass return last_f_k, last_free_energy def _is_completed(self, iteration_limit=None): """Check if we have reached the required number of iterations or statistical error.""" if iteration_limit is None: iteration_limit = self._number_of_iterations # Return if we have reached the number of iterations # or the statistical error target required. if (self._iteration >= iteration_limit or (self._last_err_free_energy is not None and self._last_err_free_energy <= self._online_analysis_target_error)): return True return False # ------------------------------------------------------------------------- # Internal-usage: Test globals # ------------------------------------------------------------------------- _global_citation_silence = False
# ============================================================================== # PARALLEL TEMPERING # ==============================================================================
class ParallelTempering(ReplicaExchange):
    """Parallel tempering simulation facility.

    This class provides a facility for parallel tempering simulations. It
    is a subclass of :class:`ReplicaExchange`, but provides efficiency
    improvements for parallel tempering simulations, so should be preferred
    for this type of simulation. In particular, this makes use of the fact
    that the reduced potentials are linear in inverse temperature.

    Examples
    --------

    Create the system.

    >>> import os
    >>> import tempfile
    >>> from simtk import unit
    >>> from openmmtools import testsystems, states, mcmc
    >>> testsystem = testsystems.AlanineDipeptideImplicit()

    Create thermodynamic states for parallel tempering with exponentially-spaced schedule.

    >>> n_replicas = 3  # Number of temperature replicas.
    >>> T_min = 298.0 * unit.kelvin  # Minimum temperature.
    >>> T_max = 600.0 * unit.kelvin  # Maximum temperature.
    >>> reference_state = states.ThermodynamicState(system=testsystem.system, temperature=T_min)

    Initialize simulation object with options. Run with a GHMC integrator.

    >>> move = mcmc.GHMCMove(timestep=2.0*unit.femtoseconds, n_steps=50)
    >>> simulation = ParallelTempering(mcmc_moves=move, number_of_iterations=2)

    Create simulation with its storage file (in a temporary directory) and run.

    >>> storage_path = tempfile.NamedTemporaryFile(delete=False).name + '.nc'
    >>> reporter = Reporter(storage_path, checkpoint_interval=10)
    >>> simulation.create(thermodynamic_state=reference_state,
    ...                   sampler_states=states.SamplerState(testsystem.positions),
    ...                   storage=reporter, min_temperature=T_min,
    ...                   max_temperature=T_max, n_temperatures=n_replicas)
    >>> simulation.run(n_iterations=1)

    Clean up.

    >>> os.remove(storage_path)

    See Also
    --------
    ReplicaExchange

    """

    def create(self, thermodynamic_state, sampler_states, storage,
               min_temperature=None, max_temperature=None, n_temperatures=None,
               temperatures=None, metadata=None):
        """Initialize a parallel tempering simulation object.

        Parameters
        ----------
        thermodynamic_state : openmmtools.states.ThermodynamicState
            Reference thermodynamic state that will be simulated at the given
            temperatures.
        sampler_states : openmmtools.states.SamplerState or list
            One or more sets of initial sampler states. If a list of SamplerStates,
            they will be assigned to replicas in a round-robin fashion.
        storage : str or Reporter
            If str: path to the storage file, checkpoint options are default
            If Reporter: Instanced :class:`Reporter` class, checkpoint information is read from
            In the future this will be able to take a Storage class as well.
        min_temperature : simtk.unit.Quantity, optional
           Minimum temperature (units of temperature, default is None).
        max_temperature : simtk.unit.Quantity, optional
           Maximum temperature (units of temperature, default is None).
        n_temperatures : int, optional
           Number of exponentially-spaced temperatures between ``min_temperature``
           and ``max_temperature`` (default is None).
        temperatures : list of simtk.unit.Quantity, optional
           If specified, this list of temperatures will be used instead of
           ``min_temperature``, ``max_temperature``, and ``n_temperatures``
           (units of temperature, default is None).
        metadata : dict, optional
           Simulation metadata to be stored in the file. A default ``'title'``
           entry is added if the dict does not contain one.

        Raises
        ------
        ValueError
            If neither ``temperatures`` nor all of ``min_temperature``,
            ``max_temperature`` and ``n_temperatures`` are given.

        Notes
        -----
        Either (``min_temperature``, ``max_temperature``, ``n_temperatures``) must all be
        specified or the list of '`temperatures`' must be specified.

        """
        # Create thermodynamic states from temperatures.
        if temperatures is not None:
            logger.debug("Using provided temperatures")
        elif min_temperature is not None and max_temperature is not None and n_temperatures is not None:
            # A single temperature would make the spacing denominator zero;
            # clamping it to 1 yields [min_temperature] in that case and
            # leaves the schedule unchanged for n_temperatures >= 2.
            # TODO drop casting to float when dropping Python 2 support.
            n_intervals = float(max(n_temperatures - 1, 1))
            temperatures = [min_temperature + (max_temperature - min_temperature) *
                            (math.exp(i / n_intervals) - 1.0) / (math.e - 1.0)
                            for i in range(n_temperatures)]
            logger.debug('using temperatures {}'.format(temperatures))
        else:
            raise ValueError("Either 'temperatures' or 'min_temperature', 'max_temperature', "
                             "and 'n_temperatures' must be provided.")

        # One copy of the reference state per temperature. The count is taken
        # from the temperature list itself because n_temperatures is None when
        # the caller passes 'temperatures' explicitly.
        thermodynamic_states = [copy.deepcopy(thermodynamic_state) for _ in range(len(temperatures))]
        for state, temperature in zip(thermodynamic_states, temperatures):
            state.temperature = temperature

        # Override default title.
        default_title = ('Parallel tempering simulation created using ParallelTempering '
                         'class of yank.repex.py on {}'.format(time.asctime(time.localtime())))
        if metadata is None:
            metadata = dict(title=default_title)
        elif 'title' not in metadata:
            metadata['title'] = default_title

        # Initialize replica-exchange simulation.
        super(ParallelTempering, self).create(thermodynamic_states, sampler_states,
                                              storage=storage, metadata=metadata)

    def _compute_replica_energies(self, replica_id):
        """Compute the energy for the replica at every temperature.

        Because only the temperatures differ among replicas, we replace the
        generic O(N^2) replica-exchange implementation with an O(N)
        implementation: the reduced potential is evaluated once at the first
        thermodynamic state and rescaled by each state's inverse temperature.

        Parameters
        ----------
        replica_id : int
            Index of the replica whose sampler state is evaluated.

        Returns
        -------
        replica_energies : numpy.ndarray
            1-D array of length ``n_replicas``; entry ``k`` is the reduced
            potential of this replica at thermodynamic state ``k``.

        """
        # Initialize replica energies for each thermodynamic state.
        replica_energies = np.zeros(self.n_replicas)

        # Retrieve sampler states associated to this replica.
        sampler_state = self._sampler_states[replica_id]

        # Thermodynamic states differ only by temperature.
        reference_thermodynamic_state = self._thermodynamic_states[0]

        # Get the context, any Integrator works.
        context, _ = mmtools.cache.global_context_cache.get_context(reference_thermodynamic_state)

        # Update positions and box vectors.
        sampler_state.apply_to_context(context)

        # Compute energy.
        reference_reduced_potential = reference_thermodynamic_state.reduced_potential(context)

        # Strip reference potential of reference state's beta.
        reference_beta = 1.0 / (mmtools.constants.kB * reference_thermodynamic_state.temperature)
        reference_reduced_potential /= reference_beta

        # Update potential energy by temperature. The array holds the energies
        # of this single replica, so it is indexed by state only.
        for thermodynamic_state_id, thermodynamic_state in enumerate(self._thermodynamic_states):
            beta = 1.0 / (mmtools.constants.kB * thermodynamic_state.temperature)
            replica_energies[thermodynamic_state_id] = beta * reference_reduced_potential

        # Return the new energies.
        return replica_energies
# ==============================================================================
# MODULE INTERNAL USAGE UTILITIES
# ==============================================================================

class _DictYamlLoader(yaml.CLoader):
    """PyYAML Loader that reads !Quantity and !ndarray tags."""

    # NOTE(review): this derives from yaml.CLoader, not a safe loader; it
    # presumably should only be used on files written by this module -- confirm.

    def __init__(self, *args, **kwargs):
        super(_DictYamlLoader, self).__init__(*args, **kwargs)
        self.add_constructor(u'!Quantity', self.quantity_constructor)
        self.add_constructor(u'!ndarray', self.ndarray_constructor)

    @staticmethod
    def quantity_constructor(loader, node):
        """Rebuild a quantity from a mapping with 'unit' and 'value' keys."""
        loaded_mapping = loader.construct_mapping(node)
        data_unit = utils.quantity_from_string(loaded_mapping['unit'])
        data_value = loaded_mapping['value']
        return data_value * data_unit

    @staticmethod
    def ndarray_constructor(loader, node):
        """Rebuild a numpy array from a mapping with 'type', 'shape' and 'values' keys."""
        loaded_mapping = loader.construct_mapping(node, deep=True)
        data_type = np.dtype(loaded_mapping['type'])
        data_shape = loaded_mapping['shape']
        data_values = loaded_mapping['values']
        data = np.ndarray(shape=data_shape, dtype=data_type)
        # Arrays with a zero-length dimension have no values to fill in.
        if 0 not in data_shape:
            # Ellipsis assignment also works for 0-d arrays (shape ()), which
            # the dumper can emit and for which data[:] raises IndexError.
            data[...] = data_values
        return data


class _DictYamlDumper(yaml.CDumper):
    """PyYAML Dumper that handles simtk Quantities and numpy arrays.

    Quantities are written through !Quantity tags and arrays through
    !ndarray tags.
    """

    def __init__(self, *args, **kwargs):
        super(_DictYamlDumper, self).__init__(*args, **kwargs)
        self.add_representer(unit.Quantity, self.quantity_representer)
        self.add_representer(np.ndarray, self.ndarray_representer)

    @staticmethod
    def quantity_representer(dumper, data):
        """Represent a quantity as a mapping of its unit string and unitless value."""
        data_unit = data.unit
        data_value = data / data_unit
        data_dump = dict(unit=str(data_unit), value=data_value)
        return dumper.represent_mapping(u'!Quantity', data_dump)

    @staticmethod
    def ndarray_representer(dumper, data):
        """Convert a numpy array to native Python types."""
        data_type = str(data.dtype)
        data_shape = data.shape
        data_values = data.tolist()
        data_dump = dict(type=data_type, shape=data_shape, values=data_values)
        return dumper.represent_mapping(u'!ndarray', data_dump)


# ==============================================================================
# MAIN AND TESTS
# ==============================================================================

if __name__ == "__main__":
    import doctest
    doctest.testmod()