# -*- coding: utf-8 -*-
"""Representation for a secondary building block."""
import warnings
from collections import defaultdict
from typing import Collection, Dict, List, Optional, Tuple
import networkx as nx
import numpy as np
import pubchempy as pcp
from backports.cached_property import cached_property
from loguru import logger
from pymatgen.analysis.graphs import MoleculeGraph
from pymatgen.core import Molecule, PeriodicSite, Structure
from pymatgen.io.babel import BabelMolAdaptor
from rdkit import Chem
from scipy.spatial.distance import pdist
from moffragmentor.utils import pickle_dump
from moffragmentor.utils.mol_compare import mcs_rank
def ob_mol_without_metals(obmol):
"""Remove metals from an OpenBabel molecule."""
import openbabel as ob
mol = obmol.clone
for atom in ob.OBMolAtomIter(mol.OBMol):
if atom.IsMetal():
mol.OBMol.DeleteAtom(atom)
return mol
__all__ = ["SBU"]
def obmol_to_rdkit_mol(obmol):
"""Convert an OpenBabel molecule to a RDKit molecule."""
smiles = obmol.write("can").strip()
mol = Chem.MolFromSmiles(smiles, sanitize=True)
if mol is None:
warnings.warn("Attempting to remove metals to generate RDKit molecule")
new_obmol = ob_mol_without_metals(obmol)
smiles = new_obmol.write("can").strip()
mol = Chem.MolFromSmiles(smiles, sanitize=True)
return mol
[docs]class SBU:
"""Representation for a secondary building block.
It also acts as container for site indices:
* graph_branching_indices: are the branching indices according
to the graph-based definition. They might not be part of the molecule.
* binding_indices: are the indices of the sites between
the branching index and metal
* original_indices: complete original set of indices that has been selected
for this building blocks
.. note::
The coordinates in the molecule object are not the ones directly
extracted from the MOF. They are the coordinates of sites unwrapped
to ensure that there are no "broken molecules" .
To obtain the "original" coordinates, use the `_coordinates` attribute.
.. note:: Dummy molecules
In dummy molecules the binding and branching sites are replaces by
dummy atoms (noble gas). They also have special properties that indicate
the original species.
Examples:
>>> # visualize the molecule
>>> sbu_object.show_molecule()
>>> # search pubchem for the molecule
>>> sbu_object.search_pubchem()
"""
def __init__(
self,
molecule: Molecule,
molecule_graph: MoleculeGraph,
graph_branching_indices: Collection[int],
binding_indices: Collection[int],
molecule_original_indices_mapping: Optional[Dict[int, List[int]]] = None,
dummy_molecule: Optional[Molecule] = None,
dummy_molecule_graph: Optional[MoleculeGraph] = None,
dummy_molecule_indices_mapping: Optional[Dict[int, List[int]]] = None,
dummy_branching_indices: Optional[Collection[int]] = None,
):
"""Initialize a secondary building block.
In practice, you won't use this constructor directly.
Args:
molecule (Molecule): Pymatgen molecule object.
molecule_graph (MoleculeGraph): Pymatgen molecule graph object.
graph_branching_indices (Collection[int]): Branching indices
in the original structure.
binding_indices (Collection[int]): Binding indices in the original structure.
molecule_original_indices_mapping (Optional[Dict[int, List[int]]], optional):
Mapping from molecule indices to original indices. Defaults to None.
dummy_molecule (Optional[Molecule], optional): Dummy molecule. Defaults to None.
dummy_molecule_graph (Optional[MoleculeGraph], optional): Dummy molecule graph.
Defaults to None.
dummy_molecule_indices_mapping (Optional[Dict[int, List[int]]], optional):
Dummy molecule indices mapping. Defaults to None.
dummy_branching_indices (Optional[Collection[int]], optional):
Dummy branching indices. Defaults to None.
"""
self.molecule = molecule
self._mapping = molecule_original_indices_mapping
self._indices = sum(list(molecule_original_indices_mapping.values()), [])
self._original_indices = self._indices
self.molecule_graph = molecule_graph
self._original_graph_branching_indices = graph_branching_indices
self._original_binding_indices = binding_indices
self._dummy_molecule = dummy_molecule
self._dummy_molecule_graph = dummy_molecule_graph
self._dummy_molecule_indices_mapping = dummy_molecule_indices_mapping
self._dummy_branching_indices = dummy_branching_indices
self.mapping_from_original_indices = defaultdict(list)
if molecule_original_indices_mapping is None:
for ori_index, index in zip(self._indices, range(len(molecule))):
self.mapping_from_original_indices[ori_index].append(index)
else:
for k, v in molecule_original_indices_mapping.items():
for i in v:
self.mapping_from_original_indices[i].append(k)
if dummy_molecule:
self.dummy_mapping_from_original_indices = defaultdict(list)
if dummy_molecule_indices_mapping is None:
for ori_index, index in zip(self._indices, range(len(dummy_molecule))):
self.dummy_mapping_from_original_indices[ori_index].append(index)
else:
for k, v in dummy_molecule_indices_mapping.items():
for i in v:
self.dummy_mapping_from_original_indices[i].append(k)
self.mapping_to_original_indices = {}
for key, value in self.mapping_from_original_indices.items():
for v in value:
self.mapping_to_original_indices[v] = key
@property
def center(self):
return self.molecule.center_of_mass
[docs] def search_pubchem(self, listkey_counts: int = 10, **kwargs) -> Tuple[List[str], bool]:
"""Search for a molecule in pubchem # noqa: DAR401
Second element of return tuple is true if there was an identity match
Args:
listkey_counts (int): Number of list keys to return
(relevant for substructure search).
Defaults to 10.
kwargs: Additional arguments to pass to PubChem.search
Returns:
Tuple[List[str], bool]: List of pubchem ids and whether there was an identity match
"""
try:
matches = pcp.get_compounds(
self.smiles, namespace="smiles", searchtype="fastidentity", **kwargs
)
if matches:
return (
matches,
True,
)
else:
raise ValueError("No matches found")
except Exception:
logger.warning(
f"Could not find {self.smiles} in pubchem, \
now performing substructure search"
)
# we use `fastsubstructure` as it fixes
# https://github.com/kjappelbaum/moffragmentor/issues/63
res = pcp.get_compounds(
self.smiles,
namespace="smiles",
searchtype="fastsubstructure",
listkey_counts=listkey_counts,
**kwargs,
)
smiles = [r.canonical_smiles for r in res]
return mcs_rank(self.smiles, smiles, additional_attributes=res), False
[docs] def get_neighbor_indices(self, site: int) -> List[int]:
"""Get list of indices of neighboring sites"""
return [site.index for site in self.molecule_graph.get_connected_sites(site)]
# make this function so we can have different flavors
def get_indices(self):
return self._indices
@property
def is_edge(self):
return len(self.branching_coords) == 2
def __len__(self):
"""Return the number of atoms in the molecule."""
return len(self.molecule)
def __str__(self):
"""Return the SMILES string for the molecule."""
return self.smiles
def dump(self, path):
pickle_dump(self, path)
def _get_nx_graph(self):
return nx.Graph(self.molecule_graph.graph.to_undirected())
@cached_property
def metal_indices(self) -> List[int]:
return [i for i, species in enumerate(self.molecule.species) if species.is_metal]
@cached_property
def nx_graph(self):
return self._get_nx_graph()
@property
def composition(self):
return self.molecule.composition.alphabetical_formula
@property
def cart_coords(self):
return self.molecule.cart_coords
@cached_property
def mol_with_coords(self):
mol = self.molecule.copy()
mol.coords = self.cart_coords
sites = mol.sites
return Molecule.from_sites(sites)
@property
def coordination(self):
return len(self.original_graph_branching_indices)
@property
def original_graph_branching_indices(self):
return self._original_graph_branching_indices
@property
def graph_branching_indices(self):
indices = []
if self._dummy_branching_indices is None:
for i in self.original_graph_branching_indices:
for index in self.mapping_from_original_indices[i]:
indices.append(index)
else:
for i in self._dummy_branching_indices:
for index in self.dummy_mapping_from_original_indices[i]:
indices.append(index)
return indices
@cached_property
def weisfeiler_lehman_graph_hash(self):
return nx.weisfeiler_lehman_graph_hash(self.molecule_graph.graph, node_attr="specie")
@property
def molar_mass(self):
return self.molecule.composition.weight
@cached_property
def hash(self) -> str:
"""Return hash.
The hash is a combination of Weisfeiler-Lehman graph hash and center.
Returns:
str: Hash.
"""
wl_hash = self.weisfeiler_lehman_graph_hash
center = self.molecule.cart_coords.mean(axis=0)
return f"{wl_hash}-{center[0]:.2f}-{center[1]:.2f}-{center[2]:.2f}"
def __eq__(self, other: "SBU") -> bool:
"""Check if two molecules are equal.
Based on the Weisfeiler-Lehman graph hash and center of mass.
Args:
other (SBU): SBU to compare to.
Returns:
bool: True if equal, False otherwise.
"""
if hash(self) != hash(other):
return False
return True
@cached_property
def branching_coords(self):
return (
self.cart_coords[self.graph_branching_indices]
if self._dummy_branching_indices is None
else self._dummy_molecule.cart_coords[self.graph_branching_indices]
)
@property
def original_binding_indices(self):
return self._original_binding_indices
@cached_property
def binding_indices(self):
indices = []
for i in self.original_binding_indices:
for index in self.mapping_from_original_indices[i]:
indices.append(index)
return indices
@cached_property
def rdkit_mol(self):
return obmol_to_rdkit_mol(self.openbabel_mol)
@cached_property
def openbabel_mol(self):
return self.get_openbabel_mol()
def get_openbabel_mol(self):
from openbabel import pybel as pb
a = BabelMolAdaptor(self.molecule)
pm = pb.Molecule(a.openbabel_mol)
return pm
def show_molecule(self):
import nglview
return nglview.show_pymatgen(self.molecule)
def show_connecting_structure(self):
import nglview
return nglview.show_pymatgen(self._get_connected_sites_structure())
def show_binding_structure(self):
import nglview
return nglview.show_pymatgen(self._get_binding_sites_structure())
def to(self, fmt: str, filename: str):
return self.molecule.to(fmt, filename)
@cached_property
def smiles(self) -> str:
"""Return canonical SMILES.
Use openbabel to compute the SMILES, but then get the
canonical version with RDKit as we observed sometimes the same
molecule ends up as different canonical SMILES for openbabel.
If RDKit cannot make a canonical SMILES (can happen with organometallics)
we simply use the openbabel version.
Returns:
str: Canonical SMILES
"""
mol = self.openbabel_mol
smiles = mol.write("can").strip()
try:
canonical = Chem.CanonSmiles(smiles)
return canonical
except Exception:
return smiles
def _get_boxed_structure(self, mol=None) -> Structure:
if mol is None:
mol = self.molecule
max_size = _get_max_sep(mol.cart_coords)
structure = mol.get_boxed_structure(
max_size + 0.1 * max_size,
max_size + 0.1 * max_size,
max_size + 0.1 * max_size,
reorder=False,
)
return structure
def _get_binding_sites_structure(self) -> Structure:
sites = []
s = self._get_boxed_structure()
for i in self.binding_indices:
sites.append(s[i])
return Structure.from_sites(sites)
def _get_branching_sites_structure(self) -> Structure:
new_sites = []
s = self._get_boxed_structure(self._dummy_molecule)
for i, site in enumerate(s):
if set(self._dummy_molecule_indices_mapping[i]) & self._dummy_branching_indices:
if "original_species" in site.properties and site.properties["original_species"]:
species = site.properties["original_species"]
else:
species = site.species
new_site = PeriodicSite(species, site.coords, site.lattice)
new_sites.append(new_site)
return Structure.from_sites(new_sites)
@cached_property
def descriptors(self):
return self._get_descriptors()
def _get_max_sep(coordinates):
if len(coordinates) > 1:
distances = pdist(coordinates)
return np.max(distances)
return 5