Source code for moffragmentor.sbu.sbu

# -*- coding: utf-8 -*-
"""Representation for a secondary building block."""
import warnings
from collections import defaultdict
from typing import Collection, Dict, List, Optional, Tuple

import networkx as nx
import numpy as np
import pubchempy as pcp
from backports.cached_property import cached_property
from loguru import logger
from pymatgen.analysis.graphs import MoleculeGraph
from pymatgen.core import Molecule, PeriodicSite, Structure
from pymatgen.io.babel import BabelMolAdaptor
from rdkit import Chem
from scipy.spatial.distance import pdist

from moffragmentor.utils import pickle_dump
from moffragmentor.utils.mol_compare import mcs_rank


def ob_mol_without_metals(obmol):
    """Remove metals from an OpenBabel molecule."""
    import openbabel as ob

    mol = obmol.clone
    for atom in ob.OBMolAtomIter(mol.OBMol):
        if atom.IsMetal():
            mol.OBMol.DeleteAtom(atom)

    return mol


__all__ = ["SBU"]


def obmol_to_rdkit_mol(obmol):
    """Convert an OpenBabel molecule to a RDKit molecule."""
    smiles = obmol.write("can").strip()
    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    if mol is None:
        warnings.warn("Attempting to remove metals to generate RDKit molecule")
        new_obmol = ob_mol_without_metals(obmol)
        smiles = new_obmol.write("can").strip()
        mol = Chem.MolFromSmiles(smiles, sanitize=True)

    return mol


[docs]class SBU:
    """Representation for a secondary building block.

    It also acts as container for site indices:

        * graph_branching_indices: are the branching indices according
            to the graph-based definition. They might not be part of the molecule.
        * binding_indices: are the indices of the sites between
            the branching index and metal
        * original_indices: complete original set of indices that has been selected
            for this building blocks

    .. note::

        The coordinates in the molecule object are not the ones directly
        extracted from the MOF. They are the coordinates of sites unwrapped
        to ensure that there are no "broken molecules" .

        To obtain the "original" coordinates, use the `_coordinates` attribute.

    .. note:: Dummy molecules

        In dummy molecules the binding and branching sites are replaces by
        dummy atoms (noble gas). They also have special properties that indicate
        the original species.

    Examples:
        >>> # visualize the molecule
        >>> sbu_object.show_molecule()
        >>> # search pubchem for the molecule
        >>> sbu_object.search_pubchem()
    """

    def __init__(
        self,
        molecule: Molecule,
        molecule_graph: MoleculeGraph,
        graph_branching_indices: Collection[int],
        binding_indices: Collection[int],
        molecule_original_indices_mapping: Optional[Dict[int, List[int]]] = None,
        dummy_molecule: Optional[Molecule] = None,
        dummy_molecule_graph: Optional[MoleculeGraph] = None,
        dummy_molecule_indices_mapping: Optional[Dict[int, List[int]]] = None,
        dummy_branching_indices: Optional[Collection[int]] = None,
    ):
        """Initialize a secondary building block.

        In practice, you won't use this constructor directly.

        Args:
            molecule (Molecule): Pymatgen molecule object.
            molecule_graph (MoleculeGraph): Pymatgen molecule graph object.
            graph_branching_indices (Collection[int]): Branching indices
                in the original structure.
            binding_indices (Collection[int]): Binding indices in the original structure.
            molecule_original_indices_mapping (Optional[Dict[int, List[int]]], optional):
                Mapping from molecule indices to original indices. Defaults to None.
            dummy_molecule (Optional[Molecule], optional): Dummy molecule. Defaults to None.
            dummy_molecule_graph (Optional[MoleculeGraph], optional): Dummy molecule graph.
                Defaults to None.
            dummy_molecule_indices_mapping (Optional[Dict[int, List[int]]], optional):
                Dummy molecule indices mapping. Defaults to None.
            dummy_branching_indices (Optional[Collection[int]], optional):
                Dummy branching indices. Defaults to None.
        """
        self.molecule = molecule
        self._mapping = molecule_original_indices_mapping

        self._indices = sum(list(molecule_original_indices_mapping.values()), [])

        self._original_indices = self._indices
        self.molecule_graph = molecule_graph
        self._original_graph_branching_indices = graph_branching_indices
        self._original_binding_indices = binding_indices

        self._dummy_molecule = dummy_molecule
        self._dummy_molecule_graph = dummy_molecule_graph
        self._dummy_molecule_indices_mapping = dummy_molecule_indices_mapping
        self._dummy_branching_indices = dummy_branching_indices

        self.mapping_from_original_indices = defaultdict(list)
        if molecule_original_indices_mapping is None:
            for ori_index, index in zip(self._indices, range(len(molecule))):
                self.mapping_from_original_indices[ori_index].append(index)
        else:
            for k, v in molecule_original_indices_mapping.items():
                for i in v:
                    self.mapping_from_original_indices[i].append(k)

        if dummy_molecule:
            self.dummy_mapping_from_original_indices = defaultdict(list)
            if dummy_molecule_indices_mapping is None:
                for ori_index, index in zip(self._indices, range(len(dummy_molecule))):
                    self.dummy_mapping_from_original_indices[ori_index].append(index)
            else:
                for k, v in dummy_molecule_indices_mapping.items():
                    for i in v:
                        self.dummy_mapping_from_original_indices[i].append(k)

        self.mapping_to_original_indices = {}
        for key, value in self.mapping_from_original_indices.items():
            for v in value:
                self.mapping_to_original_indices[v] = key

    @property
    def center(self):
        return self.molecule.center_of_mass

[docs]    def search_pubchem(self, listkey_counts: int = 10, **kwargs) -> Tuple[List[str], bool]:
        """Search for a molecule in pubchem # noqa: DAR401

        Second element of return tuple is true if there was an identity match

        Args:
            listkey_counts (int): Number of list keys to return
                (relevant for substructure search).
                Defaults to 10.
            kwargs: Additional arguments to pass to PubChem.search

        Returns:
            Tuple[List[str], bool]: List of pubchem ids and whether there was an identity match
        """
        try:
            matches = pcp.get_compounds(
                self.smiles, namespace="smiles", searchtype="fastidentity", **kwargs
            )
            if matches:
                return (
                    matches,
                    True,
                )
            else:
                raise ValueError("No matches found")
        except Exception:
            logger.warning(
                f"Could not find {self.smiles} in pubchem, \
                    now performing substructure search"
            )
            # we use `fastsubstructure` as it fixes
            # https://github.com/kjappelbaum/moffragmentor/issues/63
            res = pcp.get_compounds(
                self.smiles,
                namespace="smiles",
                searchtype="fastsubstructure",
                listkey_counts=listkey_counts,
                **kwargs,
            )
            smiles = [r.canonical_smiles for r in res]

            return mcs_rank(self.smiles, smiles, additional_attributes=res), False

[docs]    def get_neighbor_indices(self, site: int) -> List[int]:
        """Get list of indices of neighboring sites"""
        return [site.index for site in self.molecule_graph.get_connected_sites(site)]

    # make this function so we can have different flavors
    def get_indices(self):
        return self._indices

    @property
    def is_edge(self):
        return len(self.branching_coords) == 2

    def __len__(self):
        """Return the number of atoms in the molecule."""
        return len(self.molecule)

    def __str__(self):
        """Return the SMILES string for the molecule."""
        return self.smiles

    def dump(self, path):
        pickle_dump(self, path)

    def _get_nx_graph(self):
        return nx.Graph(self.molecule_graph.graph.to_undirected())

    @cached_property
    def metal_indices(self) -> List[int]:
        return [i for i, species in enumerate(self.molecule.species) if species.is_metal]

    @cached_property
    def nx_graph(self):
        return self._get_nx_graph()

    @property
    def composition(self):
        return self.molecule.composition.alphabetical_formula

    @property
    def cart_coords(self):
        return self.molecule.cart_coords

    @cached_property
    def mol_with_coords(self):
        mol = self.molecule.copy()
        mol.coords = self.cart_coords
        sites = mol.sites
        return Molecule.from_sites(sites)

    @property
    def coordination(self):
        return len(self.original_graph_branching_indices)

    @property
    def original_graph_branching_indices(self):
        return self._original_graph_branching_indices

    @property
    def graph_branching_indices(self):
        indices = []
        if self._dummy_branching_indices is None:
            for i in self.original_graph_branching_indices:
                for index in self.mapping_from_original_indices[i]:
                    indices.append(index)
        else:
            for i in self._dummy_branching_indices:
                for index in self.dummy_mapping_from_original_indices[i]:
                    indices.append(index)
        return indices

    @cached_property
    def weisfeiler_lehman_graph_hash(self):
        return nx.weisfeiler_lehman_graph_hash(self.molecule_graph.graph, node_attr="specie")

    @property
    def molar_mass(self):
        return self.molecule.composition.weight

    @cached_property
    def hash(self) -> str:
        """Return hash.

        The hash is a combination of Weisfeiler-Lehman graph hash and center.

        Returns:
            str: Hash.
        """
        wl_hash = self.weisfeiler_lehman_graph_hash
        center = self.molecule.cart_coords.mean(axis=0)
        return f"{wl_hash}-{center[0]:.2f}-{center[1]:.2f}-{center[2]:.2f}"

    def __eq__(self, other: "SBU") -> bool:
        """Check if two molecules are equal.

        Based on the Weisfeiler-Lehman graph hash and center of mass.

        Args:
            other (SBU): SBU to compare to.

        Returns:
            bool: True if equal, False otherwise.
        """
        if hash(self) != hash(other):
            return False
        return True

    @cached_property
    def branching_coords(self):
        return (
            self.cart_coords[self.graph_branching_indices]
            if self._dummy_branching_indices is None
            else self._dummy_molecule.cart_coords[self.graph_branching_indices]
        )

    @property
    def original_binding_indices(self):
        return self._original_binding_indices

    @cached_property
    def binding_indices(self):
        indices = []
        for i in self.original_binding_indices:
            for index in self.mapping_from_original_indices[i]:
                indices.append(index)
        return indices

    @cached_property
    def rdkit_mol(self):
        return obmol_to_rdkit_mol(self.openbabel_mol)

    @cached_property
    def openbabel_mol(self):
        return self.get_openbabel_mol()

    def get_openbabel_mol(self):
        from openbabel import pybel as pb

        a = BabelMolAdaptor(self.molecule)
        pm = pb.Molecule(a.openbabel_mol)
        return pm

    def show_molecule(self):
        import nglview

        return nglview.show_pymatgen(self.molecule)

    def show_connecting_structure(self):
        import nglview

        return nglview.show_pymatgen(self._get_connected_sites_structure())

    def show_binding_structure(self):
        import nglview

        return nglview.show_pymatgen(self._get_binding_sites_structure())

    def to(self, fmt: str, filename: str):
        return self.molecule.to(fmt, filename)

    @cached_property
    def smiles(self) -> str:
        """Return canonical SMILES.

        Use openbabel to compute the SMILES, but then get the
        canonical version with RDKit as we observed sometimes the same
        molecule ends up as different canonical SMILES for openbabel.
        If RDKit cannot make a canonical SMILES (can happen with organometallics)
        we simply use the openbabel version.

        Returns:
            str: Canonical SMILES
        """
        mol = self.openbabel_mol
        smiles = mol.write("can").strip()
        try:
            canonical = Chem.CanonSmiles(smiles)
            return canonical
        except Exception:
            return smiles

    def _get_boxed_structure(self, mol=None) -> Structure:
        if mol is None:
            mol = self.molecule
        max_size = _get_max_sep(mol.cart_coords)
        structure = mol.get_boxed_structure(
            max_size + 0.1 * max_size,
            max_size + 0.1 * max_size,
            max_size + 0.1 * max_size,
            reorder=False,
        )
        return structure

    def _get_binding_sites_structure(self) -> Structure:
        sites = []
        s = self._get_boxed_structure()
        for i in self.binding_indices:
            sites.append(s[i])
        return Structure.from_sites(sites)

    def _get_branching_sites_structure(self) -> Structure:
        new_sites = []
        s = self._get_boxed_structure(self._dummy_molecule)
        for i, site in enumerate(s):
            if set(self._dummy_molecule_indices_mapping[i]) & self._dummy_branching_indices:
                if "original_species" in site.properties and site.properties["original_species"]:
                    species = site.properties["original_species"]
                else:
                    species = site.species

                new_site = PeriodicSite(species, site.coords, site.lattice)
                new_sites.append(new_site)

        return Structure.from_sites(new_sites)

    @cached_property
    def descriptors(self):
        return self._get_descriptors()


def _get_max_sep(coordinates):
    if len(coordinates) > 1:
        distances = pdist(coordinates)
        return np.max(distances)
    return 5