graphchem.preprocessing.MoleculeEncoder

Bases: object

Source code in graphchem/preprocessing/features.py

class MoleculeEncoder(object):
    """ Integer encoder for molecules given as SMILES strings

    Two tokenizers (one for atom representations, one for bond
    representations) are fitted at construction time; `encode` then maps a
    molecule to (atom tokens, bond tokens, connectivity matrix).
    """

    def __init__(self, smiles: List[str]):
        """ MoleculeEncoder object: given a list of SMILES strings, construct/
        train integer tokenizers to tokenize atom/bond features, parse
        molecule connectivity

        Args:
            smiles (List[str]): SMILES strings to consider for encoder
                construction

        Raises:
            ValueError: if any SMILES string cannot be parsed by RDKit
        """

        mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles]
        for idx, mol in enumerate(mols):
            if mol is None:
                raise ValueError(f'Unable to parse SMILES: {smiles[idx]}')

        # Flatten with comprehensions rather than np.concatenate, which
        # raises when handed an empty sequence (e.g. `smiles == []`)
        atoms = [atom for mol in mols for atom in mol.GetAtoms()]
        atom_reprs = [atom_to_str(atom) for atom in atoms]
        bond_reprs = [bond_to_str(bond)
                      for atom in atoms for bond in atom.GetBonds()]

        # Feed every representation through its tokenizer once, then switch
        # training mode off on that tokenizer
        self._atom_tokenizer = Tokenizer()
        for rep in atom_reprs:
            self._atom_tokenizer(rep)
        self._atom_tokenizer.train = False

        self._bond_tokenizer = Tokenizer()
        for rep in bond_reprs:
            self._bond_tokenizer(rep)
        # Previously the atom tokenizer was disabled a second time here,
        # leaving the bond tokenizer in training mode
        self._bond_tokenizer.train = False

    @property
    def vocab_sizes(self) -> Tuple[int, int]:
        """ total vocabulary/dictionary sizes for tokenizers, in form (atom
        vocab size, bond vocab size)

        Returns:
            Tuple[int, int]: (atom vocab size, bond vocab size)
        """

        return (self._atom_tokenizer.vocab_size,
                self._bond_tokenizer.vocab_size)

    def encode_many(self, smiles: List[str]) -> List[Tuple['torch.tensor']]:
        """ batch encoding of SMILES strings

        Args:
            smiles (List[str]): list of SMILES strings

        Returns:
            List[Tuple['torch.tensor']]: List of: (atom encoding, bond
                encoding, connectivity matrix) for each compound, in the
                same order as `smiles`
        """

        return [self.encode(smi) for smi in smiles]

    def encode(self, smiles: str) -> Tuple['torch.tensor']:
        """ encode a molecule using its SMILES string

        Each bond is visited once from each of its two atoms, so the bond
        encoding and the connectivity matrix both have 2 * (number of bonds)
        entries/columns, and column `i` of the connectivity matrix belongs
        to bond encoding `i`.

        Args:
            smiles (str): molecule's SMILES string

        Returns:
            Tuple['torch.tensor']: (encoded atom features, encoded bond
                features, molecule connectivity matrix); features are cast
                to `torch.int`, connectivity is a two-row `torch.long`
                tensor of (source atom index, destination atom index)

        Raises:
            ValueError: if RDKit cannot parse `smiles`
        """

        mol = rdkit.Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f'Unable to parse SMILES: {smiles}')
        atoms = mol.GetAtoms()

        enc_atoms = torch.tensor([self._atom_tokenizer(atom_to_str(atom))
                                  for atom in atoms]).type(torch.int)

        # Single pass over (atom, bond) pairs keeps bond features and
        # connectivity columns aligned by construction
        bond_reprs = []
        sources = []
        targets = []
        for atom in atoms:
            start_idx = atom.GetIdx()
            for bond in atom.GetBonds():
                bond_reprs.append(bond_to_str(bond))
                begin_idx = bond.GetBeginAtomIdx()
                end_idx = bond.GetEndAtomIdx()
                # Orient the edge so that it leaves the current atom
                if begin_idx != start_idx:
                    begin_idx, end_idx = end_idx, begin_idx
                sources.append(begin_idx)
                targets.append(end_idx)

        enc_bonds = torch.tensor([self._bond_tokenizer(rep)
                                  for rep in bond_reprs]).type(torch.int)
        connectivity = torch.tensor([sources, targets], dtype=torch.long)

        return (enc_atoms, enc_bonds, connectivity)

    def save(self, filename: str) -> None:
        """ save the encoder to a file

        Args:
            filename (str): new filename/path for model

        Returns:
            None
        """

        with open(filename, 'wb') as outp:
            pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)

    def load(self, filename: str) -> None:
        """ load an encoder from file (current encoder attributes, including
        pre-trained tokenizers, are overwritten)

        Args:
            filename (str): filename/path of model

        Returns:
            None
        """

        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source
        with open(filename, 'rb') as inp:
            # pickle.load reads from a file object (pickle.loads expects
            # bytes and raised TypeError here)
            self.__dict__.update(pickle.load(inp).__dict__)

`vocab_sizes: Tuple[int]` `property`

total vocabulary/dictionary sizes for tokenizers, in form (atom vocab size, bond vocab size)

Returns:

Type	Description
`Tuple[int]`	(atom vocab size, bond vocab size)

`__init__(smiles)`

MoleculeEncoder object: given a list of SMILES strings, construct/train integer tokenizers to tokenize atom/bond features, and parse molecule connectivity

Parameters:

Name	Type	Description	Default
`smiles`	`List[str]`	SMILES strings to consider for encoder construction	required

Source code in graphchem/preprocessing/features.py

def __init__(self, smiles: List[str]):
    """ MoleculeEncoder object: given a list of SMILES strings, construct/
    train integer tokenizers to tokenize atom/bond features, parse
    molecule connectivity

    Args:
        smiles (List[str]): SMILES strings to consider for encoder
            construction

    Raises:
        ValueError: if any SMILES string cannot be parsed by RDKit
    """

    mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles]
    for idx, mol in enumerate(mols):
        if mol is None:
            raise ValueError(f'Unable to parse SMILES: {smiles[idx]}')

    # Flatten with comprehensions rather than np.concatenate, which raises
    # when handed an empty sequence (e.g. `smiles == []`)
    atoms = [atom for mol in mols for atom in mol.GetAtoms()]
    atom_reprs = [atom_to_str(atom) for atom in atoms]
    bond_reprs = [bond_to_str(bond)
                  for atom in atoms for bond in atom.GetBonds()]

    # Feed every representation through its tokenizer once, then switch
    # training mode off on that tokenizer
    self._atom_tokenizer = Tokenizer()
    for rep in atom_reprs:
        self._atom_tokenizer(rep)
    self._atom_tokenizer.train = False

    self._bond_tokenizer = Tokenizer()
    for rep in bond_reprs:
        self._bond_tokenizer(rep)
    # Previously the atom tokenizer was disabled a second time here,
    # leaving the bond tokenizer in training mode
    self._bond_tokenizer.train = False

`encode(smiles)`

encode a molecule using its SMILES string

Parameters:

Name	Type	Description	Default
`smiles`	`str`	molecule's SMILES string	required

Returns:

Type	Description
`Tuple[torch.tensor]`	(encoded atom features, encoded bond features, molecule connectivity matrix)

Source code in graphchem/preprocessing/features.py

def encode(self, smiles: str) -> Tuple['torch.tensor']:
    """ encode a molecule using its SMILES string

    Each bond is visited once from each of its two atoms, so the bond
    encoding and the connectivity matrix both have 2 * (number of bonds)
    entries/columns, and column `i` of the connectivity matrix belongs to
    bond encoding `i`.

    Args:
        smiles (str): molecule's SMILES string

    Returns:
        Tuple['torch.tensor']: (encoded atom features, encoded bond
            features, molecule connectivity matrix); features are cast to
            `torch.int`, connectivity is a two-row `torch.long` tensor of
            (source atom index, destination atom index)

    Raises:
        ValueError: if RDKit cannot parse `smiles`
    """

    mol = rdkit.Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'Unable to parse SMILES: {smiles}')
    atoms = mol.GetAtoms()

    enc_atoms = torch.tensor([self._atom_tokenizer(atom_to_str(atom))
                              for atom in atoms]).type(torch.int)

    # Single pass over (atom, bond) pairs keeps bond features and
    # connectivity columns aligned by construction; plain lists also avoid
    # np.concatenate, which raises for a molecule without atoms
    bond_reprs = []
    sources = []
    targets = []
    for atom in atoms:
        start_idx = atom.GetIdx()
        for bond in atom.GetBonds():
            bond_reprs.append(bond_to_str(bond))
            begin_idx = bond.GetBeginAtomIdx()
            end_idx = bond.GetEndAtomIdx()
            # Orient the edge so that it leaves the current atom
            if begin_idx != start_idx:
                begin_idx, end_idx = end_idx, begin_idx
            sources.append(begin_idx)
            targets.append(end_idx)

    enc_bonds = torch.tensor([self._bond_tokenizer(rep)
                              for rep in bond_reprs]).type(torch.int)
    # Built directly as integers instead of a float64 array cast to long
    connectivity = torch.tensor([sources, targets], dtype=torch.long)

    return (enc_atoms, enc_bonds, connectivity)

`encode_many(smiles)`

batch encoding of SMILES strings

Parameters:

Name	Type	Description	Default
`smiles`	`List[str]`	list of SMILES strings	required

Returns:

Type	Description
`List[Tuple[torch.tensor]]`	List of: (atom encoding, bond encoding, connectivity matrix) for each compound

Source code in graphchem/preprocessing/features.py

def encode_many(self, smiles: List[str]) -> List[Tuple['torch.tensor']]:
    """ encode several molecules, one `encode` call per SMILES string

    Args:
        smiles (List[str]): SMILES strings to encode

    Returns:
        List[Tuple['torch.tensor']]: one (atom encoding, bond encoding,
            connectivity matrix) tuple per input string, in input order
    """

    return [self.encode(entry) for entry in smiles]

`load(filename)`

load an encoder from file (current encoder attributes, including pre-trained tokenizers, are overwritten)

Parameters:

Name	Type	Description	Default
`filename`	`str`	filename/path of model	required

Returns:

Type	Description
`None`	None

Source code in graphchem/preprocessing/features.py

def load(self, filename: str) -> None:
    """ load an encoder from file (current encoder attributes, including
    pre-trained tokenizers, are overwritten)

    Attributes of the pickled object are merged into this instance's
    `__dict__`; attributes present only on the current instance are kept.

    Args:
        filename (str): filename/path of model

    Returns:
        None
    """

    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source
    with open(filename, 'rb') as inp:
        # pickle.load reads from a file object (pickle.loads expects bytes
        # and raised TypeError here)
        self.__dict__.update(pickle.load(inp).__dict__)

`save(filename)`

save the encoder to a file

Parameters:

Name	Type	Description	Default
`filename`	`str`	new filename/path for model	required

Returns:

Type	Description
`None`	None

Source code in graphchem/preprocessing/features.py

def save(self, filename: str) -> None:
    """ pickle this encoder to disk

    Args:
        filename (str): destination filename/path for the pickled encoder

    Returns:
        None
    """

    with open(filename, mode='wb') as handle:
        pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)

graphchem.preprocessing.MoleculeEncoder

vocab_sizes: Tuple[int] property

__init__(smiles)

encode(smiles)

encode_many(smiles)

load(filename)

save(filename)

`vocab_sizes: Tuple[int]` `property`

`__init__(smiles)`

`encode(smiles)`

`encode_many(smiles)`

`load(filename)`

`save(filename)`