Source code for rdkit2ase.substructure

import ase
from ase.build import separate
from rdkit import Chem

from rdkit2ase.ase2x import ase2rdkit
from rdkit2ase.utils import find_connected_components



[docs]
def match_substructure(
    atoms: ase.Atoms,
    smiles: str | None = None,
    smarts: str | None = None,
    mol: Chem.Mol | None = None,
    fragment: ase.Atoms | None = None,
    **kwargs,
) -> tuple[tuple[int, ...]]:
    """
    Find all matches of a substructure pattern in a given ASE Atoms object.

    Parameters
    ----------
    atoms : ase.Atoms
        The molecule or structure in which to search for substructure matches.
    smiles : str, optional
        A SMILES string representing the substructure pattern to match.
    smarts : str, optional
        A SMARTS string representing the substructure pattern to match.
    mol : Chem.Mol, optional
        An RDKit Mol object representing the substructure pattern to match.
    fragment : ase.Atoms, optional
        An ASE Atoms object representing the substructure pattern to match.
        If provided, it will be converted to an RDKit Mol object for matching.
    **kwargs
        Additional keyword arguments passed to `ase2rdkit`.

    Returns
    -------
    tuple of tuple of int
        A tuple of atom index tuples, each corresponding to one match of the pattern.
    """
    pattern = None
    if smiles is not None:
        pattern = Chem.MolFromSmiles(smiles)
        pattern = Chem.AddHs(pattern)  # Ensure hydrogens are added for matching
    if smarts is not None:
        if pattern is not None:
            raise ValueError("Can only specify one pattern")
        pattern = Chem.MolFromSmarts(smarts)
    if mol is not None:
        if pattern is not None:
            raise ValueError("Can only specify one pattern")
        pattern = mol
    if fragment is not None:
        if pattern is not None:
            raise ValueError("Can only specify one pattern")
        pattern = ase2rdkit(fragment, **kwargs)
    if pattern is None:
        raise ValueError("Must specify a pattern")

    Chem.SanitizeMol(pattern)

    mol = ase2rdkit(atoms, **kwargs)
    matches = mol.GetSubstructMatches(pattern)
    return matches




[docs]
def get_substructures(
    atoms: ase.Atoms,
    **kwargs,
) -> list[ase.Atoms]:
    """
    Extract all matched substructures from an ASE Atoms object.

    Parameters
    ----------
    atoms : ase.Atoms
        The structure to search in.
    smarts : str, optional
        A SMARTS string to match substructures.
    smiles : str, optional
        A SMILES string to match substructures.
    mol : Chem.Mol, optional
        An RDKit Mol object to match substructures.
    fragment : ase.Atoms, optional
        A specific ASE Atoms object to match against the structure.
    **kwargs
        Additional keyword arguments passed to `match_substructure`.

    Returns
    -------
    list of ase.Atoms
        List of substructure fragments matching the pattern.
    """
    return [atoms[match] for match in match_substructure(atoms, **kwargs)]




[docs]
def iter_fragments(atoms: ase.Atoms) -> list[ase.Atoms]:
    """
    Iterate over connected molecular fragments in an ASE Atoms object.

    If a 'connectivity' field is present in `atoms.info`, it will be used
    to determine fragments. Otherwise, `ase.build.separate` will be used.

    Parameters
    ----------
    atoms : ase.Atoms
        A structure that may contain one or more molecular fragments.

    Yields
    ------
    ase.Atoms
        Each connected component (fragment) in the input structure.
    """
    if "connectivity" in atoms.info:
        # connectivity is a list of tuples (i, j, bond_type)
        connectivity = atoms.info["connectivity"]
        for component in find_connected_components(connectivity):
            yield atoms[list(component)]
    else:
        for molecule in separate(atoms):
            yield molecule