From 0d6fbbccaa8dd95990092fc588e5a657b81e8c99 Mon Sep 17 00:00:00 2001 From: CalCraven <54594941+CalCraven@users.noreply.github.com> Date: Mon, 29 May 2023 12:32:52 -0500 Subject: [PATCH] Methods for converting to Pandas Dataframes (#524) * add base functions for converting to dataframes * update atomtypes_to_dataframes so it can handle labels that need to be split at a '.' * add functions for plotting connection parameters * add docstrings for unit conversions, create a function that includes specified labels in the dataframes object * apply black * add associated unyt tests * remove unused variable * fix charge output from bonds * Add unit tests for labels with a and parameter sites * fix bug with testing charges read into pandas df * update position values used in test__pandas_df * delete changes brought in for networkx visualization PR * fixes for testing bugs * delete requirements-test.txt * add generalization for parameters in different bonds, angles, dihedrals. Also, add a unyts_bool flag to allow someone to output numeric quantities as unyt quantities or floats * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change function name from to_datatables to data_frame * address reviews and add a few modular functions to break up into smaller bits. 
Add improper support * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update gmso/core/topology.py Co-authored-by: Umesh Timalsina * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use unyt_to_dict to handle printing units in tables * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix units on charge conversions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * allow for optional pandas dependency * replace labels argument for to_dataframes function with site_attrs to make it more clear what is being output to the dataframe * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bug in gmso/tests/test_topology.py test_to_dataframe * remove has_pandas import in test_topology.py * Push import_ for soft import of pandas directly into to_dataframe function within topology.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update gmso/tests/test_topology.py Co-authored-by: Umesh Timalsina * Add pandas dependency to environment-dev.yml for testing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * replace missing code in index connections function. 
* Update gmso/core/topology.py Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Suggestion from Co for reduction in code repeats Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use f string formatting Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * f string formatting Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * f string formatting Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * f string formatting Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * f string formatting Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> * Fixes for more f-string formatting, added default unit registry for conversion of elementary_charge units * Add tests for unit registry module adding new unit values * fixes to default registry calls --------- Co-authored-by: Co Quach <43968221+daico007@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Justin Gilmer Co-authored-by: Umesh Timalsina --- environment-dev.yml | 1 + gmso/core/topology.py | 274 ++++++++++++++++++++++++++++++++++++ gmso/tests/test_topology.py | 125 +++++++++++++++- gmso/utils/io.py | 8 ++ gmso/utils/units.py | 90 ++++++++++++ 5 files changed, 497 insertions(+), 1 deletion(-) create mode 100644 gmso/utils/units.py diff --git a/environment-dev.yml b/environment-dev.yml index 5e47668a4..f76b8577a 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -24,6 +24,7 @@ dependencies: - ipywidgets - ele>=0.2.0 - pre-commit + - pandas - symengine - python-symengine - hoomd>=3 diff --git a/gmso/core/topology.py b/gmso/core/topology.py index f0ce79107..4de86a658 100644 --- 
a/gmso/core/topology.py +++ b/gmso/core/topology.py @@ -9,6 +9,7 @@ import gmso from gmso.abc.abstract_site import Site +from gmso.abc.serialization_utils import unyt_to_dict from gmso.core.angle import Angle from gmso.core.angle_type import AngleType from gmso.core.atom import Atom @@ -25,6 +26,7 @@ from gmso.utils.connectivity import ( identify_connections as _identify_connections, ) +from gmso.utils.units import GMSO_UnitRegsitry as UnitReg scaling_interaction_idxes = {"12": 0, "13": 1, "14": 2} @@ -1157,6 +1159,87 @@ def get_index(self, member): return index + def to_dataframe(self, parameter="sites", site_attrs=None, unyts_bool=True): + """Return a pandas dataframe object for the sites in a topology + + Parameters + ---------- + parameter : str, default='sites' + A string determining what aspects of the gmso topology will be reported. + Options are: 'sites', 'bonds', 'angles', 'dihedrals', and 'impropers'. Defaults to 'sites'. + site_attrs : list of str, default=None + List of strings that are attributes of the topology site and can be included as entries in the pandas dataframe. + Examples of these can be found by printing `topology.sites[0].__dict__`. + See https://gmso.mosdef.org/en/stable/data_structures.html#gmso.Atom for additional information on labeling. + unyts_bool: bool, default=True + Determine if numerical values are saved as unyt quantities or floats. See + https://unyt.readthedocs.io/en/stable/usage.html + for more information about manipulating unyt quantities. + Default is True. + + Returns + ------- + Pandas Dataframe + A pandas.Dataframe object, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html + for further information. + + Examples + ________ + >>> topology.to_dataframe(parameter = 'sites', site_attrs = ['charge']) + This will return a dataframe with a listing of the sites and include the charges that correspond to each site. 
+ >>> topology.to_dataframe(parameter = 'dihedrals', site_attrs = ['positions']) + This will return a dataframe with a listing of the sites that make up each dihedral, the positions of each of + those sites, and the parameters that are associated with the dihedrals. + + Notes + ____ + A dataframe is easily manipulated. In order to change the rounding to two decimals places for a column named `label`: + >>> df['label'] = df['label'].round(2) + The column labels can also be easily modified. This line can take a dataframe `df` and rename a column labeled + `Atom0` to `newname` using a dictionary. + >>> df.rename(columns = {'Atom0':'newname'}) + See https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/index.html for further information. + """ + from gmso.utils.io import import_ + + pd = import_("pandas") + if not site_attrs: + site_attrs = [] + df = pd.DataFrame() + if not self.is_typed(): + raise GMSOError( + "This topology is not typed, please type this object before converting to a pandas dataframe" + ) + if parameter == "sites": + df["atom_types"] = list(site.atom_type.name for site in self.sites) + df["names"] = list(site.name for site in self.sites) + for attr in site_attrs: + df = self._parse_dataframe_attrs( + df, attr, parameter, unyts_bool + ) + elif parameter in ["bonds", "angles", "dihedrals", "impropers"]: + if len(getattr(self, parameter)) == 0: + raise GMSOError( + f"There arent any {parameter} in the topology. The dataframe would be empty." + ) + df = self._pandas_from_parameters( + df, + parameter=parameter, + site_attrs=site_attrs, + unyts_bool=unyts_bool, + ) + df = self._parse_parameter_expression(df, parameter, unyts_bool) + else: + raise AttributeError( + "{} is not yet supported for outputting parameters to a dataframe. 
\ + Please use one of 'sites', 'bonds', 'angles', 'dihedrals', or \ + 'impropers'".format( + str(parameter) + ) + ) + + return df + def get_forcefield(self): """Get an instance of gmso.ForceField out of this topology @@ -1417,6 +1500,190 @@ def __str__(self): """Return custom format to represent topology as a string.""" return f"" + def _pandas_from_parameters( + self, df, parameter, site_attrs=None, unyts_bool=True + ): + """Add to a pandas dataframe the site indices for each connection member in a + multimember topology attribute such as a bond. Also include information about + those sites in the site_attrs list""" + if site_attrs is None: + site_attrs = [] + sites_per_connection = len( + getattr(self, parameter)[0].connection_members + ) + for site_index in np.arange(sites_per_connection): + df["Atom" + str(site_index)] = list( + str(connection.connection_members[site_index].name) + + f"({self.get_index(connection.connection_members[site_index])})" + for connection in getattr(self, parameter) + ) + for attr in site_attrs: + df = self._parse_dataframe_attrs( + df, attr, parameter, sites_per_connection, unyts_bool + ) + return df + + def _parse_dataframe_attrs( + self, df, attr, parameter, sites_per_connection=1, unyts_bool=True + ): + """Parses an attribute string to correctly format and return the topology attribute + into a pandas dataframe""" + if parameter == "sites": + if "." in attr: + try: + attr1, attr2 = attr.split(".") + df[attr] = list( + _return_float_for_unyt( + getattr(getattr(site, attr1), attr2), + unyts_bool, + ) + for site in self.sites + ) + except AttributeError: + raise AttributeError( + f"The attribute {attr} is not in this gmso object." 
+ ) + elif attr == "positions" or attr == "position": + for i, dimension in enumerate(["x", "y", "z"]): + df[dimension] = list( + _return_float_for_unyt( + getattr(site, "position")[i], unyts_bool + ) + for site in self.sites + ) + elif attr == "charge" or attr == "charges": + df["charge (e)"] = list( + site.charge.in_units( + u.Unit( + "elementary_charge", registry=UnitReg.default_reg() + ) + ).to_value() + for site in self.sites + ) + else: + try: + df[attr] = list( + _return_float_for_unyt(getattr(site, attr), unyts_bool) + for site in self.sites + ) + except AttributeError: + raise AttributeError( + f"The attribute {attr} is not in this gmso object." + ) + + elif parameter in ["bonds", "angles", "dihedrals", "impropers"]: + for site_index in np.arange(sites_per_connection): + if "." in attr: + try: + attr1, attr2 = attr.split(".") + df[attr + " Atom" + str(site_index)] = list( + _return_float_for_unyt( + getattr( + getattr( + connection.connection_members[ + site_index + ], + attr1, + ), + attr2, + ), + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + except AttributeError: + raise AttributeError( + f"The attribute {attr} is not in this gmso object." 
+ ) + elif attr == "positions" or attr == "position": + df["x Atom" + str(site_index) + " (nm)"] = list( + _return_float_for_unyt( + getattr( + connection.connection_members[site_index], + "position", + )[0], + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + df["y Atom" + str(site_index) + " (nm)"] = list( + _return_float_for_unyt( + getattr( + connection.connection_members[site_index], + "position", + )[1], + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + df["z Atom" + str(site_index) + " (nm)"] = list( + _return_float_for_unyt( + getattr( + connection.connection_members[site_index], + "position", + )[2], + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + elif attr == "charge" or attr == "charges": + df["charge Atom" + str(site_index) + " (e)"] = list( + getattr( + connection.connection_members[site_index], + "charge", + ) + .in_units( + u.Unit( + "elementary_charge", + registry=UnitReg.default_reg(), + ) + ) + .value + for connection in getattr(self, parameter) + ) + else: + try: + df[f"{attr} Atom {site_index}"] = list( + _return_float_for_unyt( + getattr( + connection.connection_members[site_index], + attr, + ), + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + except AttributeError: + raise AttributeError( + f"The attribute {attr} is not in this gmso object." + ) + else: + raise AttributeError( + f"{parameter} is not yet supported for adding labels to a dataframe. 
\ + Please use one of 'sites', 'bonds', 'angles', 'dihedrals', or 'impropers'" + ) + return df + + def _parse_parameter_expression(self, df, parameter, unyts_bool): + """Take a given topology attribute and return the parameters associated with it""" + for i, param in enumerate( + getattr( + getattr(self, parameter)[0], parameter[:-1] + "_type" + ).parameters + ): + df[ + f"Parameter {i} ({param}) {getattr(getattr(self, parameter)[0], parameter[:-1]+'_type').parameters[param].units}" + ] = list( + _return_float_for_unyt( + getattr(connection, parameter[:-1] + "_type").parameters[ + param + ], + unyts_bool, + ) + for connection in getattr(self, parameter) + ) + return df + @classmethod def load(cls, filename, **kwargs): """Load a file to a topology""" @@ -1425,3 +1692,10 @@ def load(cls, filename, **kwargs): loader = LoadersRegistry.get_callable(filename.suffix) return loader(filename, **kwargs) + + +def _return_float_for_unyt(unyt_quant, unyts_bool): + try: + return unyt_quant if unyts_bool else unyt_to_dict(unyt_quant)["array"] + except TypeError: + return unyt_quant diff --git a/gmso/tests/test_topology.py b/gmso/tests/test_topology.py index 658f0ee28..70ec8c14b 100644 --- a/gmso/tests/test_topology.py +++ b/gmso/tests/test_topology.py @@ -20,7 +20,8 @@ from gmso.exceptions import GMSOError from gmso.external.convert_parmed import from_parmed from gmso.tests.base_test import BaseTest -from gmso.utils.io import get_fn, has_parmed, import_ +from gmso.utils.io import get_fn, has_pandas, has_parmed, import_ +from gmso.utils.units import GMSO_UnitRegsitry as UnitReg if has_parmed: pmd = import_("parmed") @@ -722,6 +723,106 @@ def test_topology_set_scaling_factors_none(self): with pytest.raises(ValueError): top.set_scaling_factors(None, None) + @pytest.mark.skipif(not has_pandas, reason="Pandas is not installed") + def test_to_dataframe(self, typed_ethane): + assert len(typed_ethane.to_dataframe()) == 8 + assert len(typed_ethane.to_dataframe(parameter="bonds")) == 7 + 
assert len(typed_ethane.to_dataframe(parameter="angles")) == 12 + assert len(typed_ethane.to_dataframe(parameter="dihedrals")) == 9 + assert np.isclose( + float( + typed_ethane.to_dataframe(site_attrs=["charge", "position"])[ + "charge (e)" + ][0] + ), + typed_ethane.sites[0] + .charge.in_units( + u.Unit("elementary_charge", registry=UnitReg.default_reg()) + ) + .to_value(), + ) + assert ( + typed_ethane.to_dataframe(site_attrs=["atom_type.name"])[ + "atom_type.name" + ][0] + == "opls_135" + ) + assert np.allclose( + float( + typed_ethane.to_dataframe(site_attrs=["charge", "position"])[ + "x" + ][0] + ), + 0, + ) + assert np.allclose( + float( + typed_ethane.to_dataframe( + parameter="bonds", site_attrs=["charge", "position"] + )["charge Atom0 (e)"][0] + ), + typed_ethane.bonds[0] + .connection_members[0] + .charge.in_units( + u.Unit("elementary_charge", registry=UnitReg.default_reg()) + ) + .to_value(), + ) + with pytest.raises(AttributeError) as e: + typed_ethane.to_dataframe(site_attrs=["missingattr"]) + assert ( + str(e.value) + == "The attribute missingattr is not in this gmso object." + ) + with pytest.raises(AttributeError) as e: + typed_ethane.to_dataframe(site_attrs=["missingattr.missingattr"]) + assert ( + str(e.value) + == "The attribute missingattr.missingattr is not in this gmso object." + ) + with pytest.raises(AttributeError) as e: + typed_ethane.to_dataframe(site_attrs=["missingattr.attr"]) + assert ( + str(e.value) + == "The attribute missingattr.attr is not in this gmso object." + ) + with pytest.raises(AttributeError) as e: + typed_ethane.to_dataframe( + parameter="bonds", site_attrs=["missingattr"] + ) + assert ( + str(e.value) + == "The attribute missingattr is not in this gmso object." + ) + with pytest.raises(AttributeError) as e: + typed_ethane.to_dataframe( + parameter="bonds", site_attrs=["missingattr.attr"] + ) + assert ( + str(e.value) + == "The attribute missingattr.attr is not in this gmso object." 
+ ) + with pytest.raises(GMSOError) as e: + top = Topology() + top.to_dataframe(parameter="bonds") + assert ( + str(e.value) + == "There arent any bonds in the topology. The dataframe would be empty." + ) + + @pytest.mark.skipif(not has_pandas, reason="Pandas is not installed") + def test_pandas_from_parameters(self, typed_ethane): + pd = import_("pandas") + df = pd.DataFrame() + assert np.allclose( + float( + typed_ethane._pandas_from_parameters( + df, "bonds", ["positions"] + )["x Atom1 (nm)"][6] + ), + -0.03570001, + ) + def test_is_typed_check(self, typed_chloroethanol): groups = [ "sites", @@ -811,3 +912,25 @@ def test_write_forcefield(self, typed_water_system): top = Topology() with pytest.raises(GMSOError): top.get_forcefield() + + def test_units(self, typed_ethane): + reg = UnitReg() + assert np.isclose( + typed_ethane.sites[0] + .charge.in_units(u.Unit("elementary_charge", registry=reg.reg)) + .to_value(), + -0.18, + ) + conversion = ( + 10 * getattr(u.physical_constants, "elementary_charge").value + ) + reg.register_unit( + "test_charge", + conversion, + [u.dimensions.current_mks, u.dimensions.time], + ) + assert reg.reg["test_charge"] + assert_allclose_units( + 1.60217662e-19 * u.Coulomb, + 0.1 * u.Unit("test_charge", registry=reg.reg), + ) diff --git a/gmso/utils/io.py b/gmso/utils/io.py index f8ca2be4b..8dc3b632d 100644 --- a/gmso/utils/io.py +++ b/gmso/utils/io.py @@ -197,6 +197,14 @@ def import_(module): except ImportError: has_matplotlib = False +try: + import pandas + + has_pandas = True + del pandas +except ImportError: + has_pandas = False + def run_from_ipython(): """Verify that the code is running in an ipython kernel.""" diff --git a/gmso/utils/units.py b/gmso/utils/units.py new file mode 100644 index 000000000..066d0b23d --- /dev/null +++ b/gmso/utils/units.py @@ -0,0 +1,90 @@ +"""Source of available units registered within GMSO.""" + +import numpy as np +import unyt as u + + +class GMSO_UnitRegsitry(object): + """A default unit registry 
class. + + The basic units that need to be added for various unit conversions done + throughout GMSO. + + Attributes + ---------- + reg : u.UnitRegistry + The unit registry useful for conversions commonly used in molecular topologies + """ + + def __init__(self): + self.reg_ = u.UnitRegistry() + conversion = ( + 1 * getattr(u.physical_constants, "elementary_charge").value + ) + self.register_unit( + "elementary_charge", + conversion, + [u.dimensions.current_mks, u.dimensions.time], + r"\rm{e}", + ) + + def register_unit( + self, + name: str, + conversion: float, + dimensionsList: list, + tex_repr=None, + ): + """Add units to the self.reg UnitRegistry. + + Parameters + ---------- + registry : u.unyt_registy, required + Unit registry to add the unit to. See unyt.unyt_registry for more information + dimensionsList : list, required + A list of the dimensions that the unit will be registered under. If using the inverse of a dimension + be sure to supply 1/u.dimension as the element of the list. + conversion : float, required + The numerical value for the conversion in SI units with the same dimensions. See unyt.unyt_registry.add + module for more information + name : str, required + Then name of the unyt to be referenced as string when calling u.Unit("unit_name") + tex_repr : str, optional, default None + The latex representation that is used to visualze the unit when pretty print is used. + + + """ + dim = np.prod(dimensionsList) + if not tex_repr: + tex_repr = r"\rm{name}" + self.reg_.add( + symbol=name, + base_value=conversion, + dimensions=dim, + tex_repr=tex_repr, + ) + + @property + def reg(self): + """Return the UnitRegistry attribute for the class.""" + return self.__dict__.get("reg_") + + @staticmethod + def default_reg(): + """Return a default registry with extra units defined outside of unyt. + + Returns + ------- + reg : u.unyt_registy + A unyt registry with commonly used conversions defined. 
+ """ + reg = u.UnitRegistry() + conversion = ( + 1 * getattr(u.physical_constants, "elementary_charge").value + ) + dimensionsList = [u.dimensions.current_mks, u.dimensions.time] + dim = np.prod(dimensionsList) + name = "elementary_charge" + symbol = r"\rm{e}" + reg.add(name, conversion, dim, symbol) + return reg