Skip to content

graphchem.preprocessing.MoleculeEncoder

Bases: object

Source code in graphchem/preprocessing/features.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
class MoleculeEncoder(object):
    """ Turns SMILES strings into integer-encoded atom features, integer-
    encoded bond features and a connectivity matrix

    An atom tokenizer and a bond tokenizer are built from the SMILES strings
    given at construction; `encode`/`encode_many` then apply them.
    """

    def __init__(self, smiles: List[str]):
        """ MoleculeEncoder object: given a list of SMILES strings, construct/
        train integer tokenizers to tokenize atom/bond features, parse
        molecule connectivity

        Args:
            smiles (List[str]): SMILES strings to consider for encoder
                construction

        Raises:
            ValueError: if RDKit returns None for any supplied SMILES string
        """

        mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles]
        for smi, mol in zip(smiles, mols):
            if mol is None:
                raise ValueError(f'Unable to parse SMILES: {smi}')

        atoms = np.concatenate([mol.GetAtoms() for mol in mols])
        atom_reprs = [atom_to_str(atom) for atom in atoms]
        # bonds are gathered per atom, i.e. in the same order `encode` uses
        bond_reprs = np.concatenate(
            [[bond_to_str(bond) for bond in atom.GetBonds()] for atom in atoms]
        )

        # feed every representation through the tokenizer once, then switch
        # its `train` flag off
        self._atom_tokenizer = Tokenizer()
        for rep in atom_reprs:
            self._atom_tokenizer(rep)
        self._atom_tokenizer.train = False

        self._bond_tokenizer = Tokenizer()
        for rep in bond_reprs:
            self._bond_tokenizer(rep)
        # fix: the bond tokenizer (not the atom tokenizer a second time) must
        # have its `train` flag switched off here
        self._bond_tokenizer.train = False

    @property
    def vocab_sizes(self) -> Tuple[int, int]:
        """ total vocabulary/dictionary sizes for tokenizers, in form (atom
        vocab size, bond vocab size)

        Returns:
            Tuple[int, int]: (atom vocab size, bond vocab size)
        """

        return (self._atom_tokenizer.vocab_size,
                self._bond_tokenizer.vocab_size)

    def encode_many(
        self, smiles: List[str]
    ) -> List[Tuple['torch.Tensor', 'torch.Tensor', 'torch.Tensor']]:
        """ batch encoding of SMILES strings

        Args:
            smiles (List[str]): list of SMILES strings

        Returns:
            List[Tuple[torch.Tensor, ...]]: List of: (atom encoding, bond
                encoding, connectivity matrix) for each compound, in input
                order

        Raises:
            ValueError: if any SMILES string cannot be parsed (see `encode`)
        """

        return [self.encode(smi) for smi in smiles]

    def encode(
        self, smiles: str
    ) -> Tuple['torch.Tensor', 'torch.Tensor', 'torch.Tensor']:
        """ encode a molecule using its SMILES string

        Args:
            smiles (str): molecule's SMILES string

        Returns:
            Tuple[torch.Tensor, ...]: (encoded atom features as torch.int,
                encoded bond features as torch.int, molecule connectivity
                matrix as torch.long with shape (2, 2 * number of bonds))

        Raises:
            ValueError: if RDKit returns None for the SMILES string
        """

        mol = rdkit.Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f'Unable to parse SMILES: {smiles}')
        atoms = mol.GetAtoms()

        atom_reprs = [atom_to_str(atom) for atom in atoms]
        enc_atoms = torch.tensor([self._atom_tokenizer(atom)
                                  for atom in atom_reprs]).type(torch.int)

        bond_reprs = np.concatenate(
            [[bond_to_str(bond) for bond in atom.GetBonds()] for atom in atoms]
        )
        enc_bonds = torch.tensor([self._bond_tokenizer(bond)
                                  for bond in bond_reprs]).type(torch.int)

        # one column per (atom, bond) pair, visited in the same order as
        # `bond_reprs` above so columns line up with `enc_bonds`
        connectivity = np.zeros((2, 2 * mol.GetNumBonds()))
        bond_index = 0
        for atom in atoms:
            start_idx = atom.GetIdx()
            for bond in atom.GetBonds():
                begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                # orient the edge so that row 0 holds the current atom
                if begin != start_idx:
                    begin, end = end, begin
                connectivity[:, bond_index] = (begin, end)
                bond_index += 1
        connectivity = torch.from_numpy(connectivity).type(torch.long)

        return (enc_atoms, enc_bonds, connectivity)

    def save(self, filename: str) -> None:
        """ save the encoder to a file

        Args:
            filename (str): new filename/path for model

        Returns:
            None
        """

        with open(filename, 'wb') as outp:
            pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)

    def load(self, filename: str) -> None:
        """ load an encoder from file (current encoder attributes, including
        pre-trained tokenizers, are overwritten)

        Args:
            filename (str): filename/path of model

        Returns:
            None
        """

        # NOTE: unpickling executes arbitrary code; only load files that come
        # from a trusted source
        with open(filename, 'rb') as inp:
            # fix: `pickle.load` reads from a file object; `pickle.loads`
            # expects a bytes object and raised TypeError here
            self.__dict__.update(pickle.load(inp).__dict__)

vocab_sizes: Tuple[int] property

Total vocabulary/dictionary sizes of the tokenizers, in the form (atom vocab size, bond vocab size).

Returns:

Type Description
Tuple[int]

Tuple[int]: (atom vocab size, bond vocab size)

__init__(smiles)

MoleculeEncoder object: given a list of SMILES strings, construct/train integer tokenizers to tokenize atom/bond features, and parse molecule connectivity.

Parameters:

Name Type Description Default
smiles List[str]

SMILES strings to consider for encoder construction

required
Source code in graphchem/preprocessing/features.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def __init__(self, smiles: List[str]):
    """ MoleculeEncoder object: given a list of SMILES strings, construct/
    train integer tokenizers to tokenize atom/bond features, parse
    molecule connectivity

    Args:
        smiles (List[str]): SMILES strings to consider for encoder
            construction

    Raises:
        ValueError: if RDKit returns None for any supplied SMILES string
    """

    mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles]
    for smi, mol in zip(smiles, mols):
        if mol is None:
            raise ValueError(f'Unable to parse SMILES: {smi}')

    atoms = np.concatenate([mol.GetAtoms() for mol in mols])
    atom_reprs = [atom_to_str(atom) for atom in atoms]
    # bonds are gathered atom by atom
    bond_reprs = np.concatenate(
        [[bond_to_str(bond) for bond in atom.GetBonds()] for atom in atoms]
    )

    # feed every representation through the tokenizer once, then switch its
    # `train` flag off
    self._atom_tokenizer = Tokenizer()
    for rep in atom_reprs:
        self._atom_tokenizer(rep)
    self._atom_tokenizer.train = False

    self._bond_tokenizer = Tokenizer()
    for rep in bond_reprs:
        self._bond_tokenizer(rep)
    # fix: switch off the bond tokenizer's `train` flag (the original reset
    # the atom tokenizer's flag a second time and left this one untouched)
    self._bond_tokenizer.train = False

encode(smiles)

encode a molecule using its SMILES string

Parameters:

Name Type Description Default
smiles str

molecule's SMILES string

required

Returns:

Type Description
Tuple[torch.tensor]

Tuple['torch.tensor']: (encoded atom features, encoded bond features, molecule connectivity matrix)

Source code in graphchem/preprocessing/features.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def encode(self, smiles: str) -> Tuple['torch.tensor']:
    """ convert one SMILES string into tokenized atoms, tokenized bonds and
    a connectivity matrix

    Args:
        smiles (str): SMILES string of the molecule to encode

    Returns:
        Tuple['torch.tensor']: (atom tokens as torch.int, bond tokens as
            torch.int, connectivity matrix as torch.long with shape
            (2, 2 * number of bonds))

    Raises:
        ValueError: if RDKit returns None for the SMILES string
    """

    molecule = rdkit.Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f'Unable to parse SMILES: {smiles}')
    mol_atoms = molecule.GetAtoms()

    # atoms: string representation first, then integer token
    atom_strings = [atom_to_str(a) for a in mol_atoms]
    atom_tokens = [self._atom_tokenizer(s) for s in atom_strings]
    enc_atoms = torch.tensor(atom_tokens).type(torch.int)

    # bonds: collected per atom, flattened, then tokenized
    per_atom_bonds = [
        [bond_to_str(b) for b in a.GetBonds()] for a in mol_atoms
    ]
    bond_strings = np.concatenate(per_atom_bonds)
    bond_tokens = [self._bond_tokenizer(s) for s in bond_strings]
    enc_bonds = torch.tensor(bond_tokens).type(torch.int)

    # connectivity: one column per (atom, bond) pair, in the same visiting
    # order as the bond tokens; row 0 is the atom being visited
    edges = np.zeros((2, 2 * molecule.GetNumBonds()))
    column = 0
    for a in mol_atoms:
        here = a.GetIdx()
        for b in a.GetBonds():
            first, second = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
            if first != here:
                first, second = second, first
            edges[0, column] = first
            edges[1, column] = second
            column += 1
    connectivity = torch.from_numpy(edges).type(torch.long)

    return enc_atoms, enc_bonds, connectivity

encode_many(smiles)

batch encoding of SMILES strings

Parameters:

Name Type Description Default
smiles List[str]

list of SMILES strings

required

Returns:

Type Description
List[Tuple[torch.tensor]]

List[Tuple['torch.tensor']]: List of: (atom encoding, bond encoding, connectivity matrix) for each compound

Source code in graphchem/preprocessing/features.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def encode_many(self, smiles: List[str]) -> List[Tuple['torch.tensor']]:
    """ encode several SMILES strings, one after another

    Args:
        smiles (List[str]): SMILES strings to encode

    Returns:
        List[Tuple['torch.tensor']]: one (atom encoding, bond encoding,
            connectivity matrix) tuple per input string, in input order
    """

    # delegate each string to `encode`; any error it raises propagates
    return [self.encode(single) for single in smiles]

load(filename)

load an encoder from file (current encoder attributes, including pre-trained tokenizers, are overwritten)

Parameters:

Name Type Description Default
filename str

filename/path of model

required

Returns:

Type Description
None

None

Source code in graphchem/preprocessing/features.py
241
242
243
244
245
246
247
248
249
250
251
252
253
def load(self, filename: str) -> None:
    """ load an encoder from file (current encoder attributes, including
    pre-trained tokenizers, are overwritten)

    Args:
        filename (str): filename/path of model

    Returns:
        None
    """

    # NOTE: unpickling executes arbitrary code; only load files that come
    # from a trusted source
    with open(filename, 'rb') as inp:
        # fix: `pickle.load` reads from a file object; `pickle.loads` expects
        # a bytes object and raised TypeError when handed the open file
        self.__dict__.update(pickle.load(inp).__dict__)

save(filename)

save the encoder to a file

Parameters:

Name Type Description Default
filename str

new filename/path for model

required

Returns:

Type Description
None

None

Source code in graphchem/preprocessing/features.py
228
229
230
231
232
233
234
235
236
237
238
239
def save(self, filename: str) -> None:
    """ write this encoder to disk as a pickle

    Args:
        filename (str): path of the file to create or overwrite

    Returns:
        None
    """

    # binary mode is required for pickle output
    with open(filename, mode='wb') as handle:
        pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)