Skip to content

graphchem.preprocessing.Tokenizer

Bases: object

Source code in graphchem/preprocessing/features.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class Tokenizer(object):
    """ Tokenizer: maps unique atom/bond strings to integer tokens

    The vocabulary starts with the single entry 'unk' (token 1). While
    `train` is True, every previously unseen string is assigned the next
    unused integer; once `train` is False, unseen strings are recorded in
    `unknown` and mapped to token 1.
    """

    def __init__(self):
        """ Tokenizer object: starts in training mode with only the 'unk'
        entry in its vocabulary and no recorded unknown strings
        """

        self.unknown = []          # unseen strings met while not training
        self.train = True          # when True, unseen strings are learned
        self.num_classes = 1       # highest token handed out so far
        self._data = {'unk': 1}    # string -> integer token

    def __call__(self, item: str) -> int:
        """ Tokenizer(): looks up the integer token of an atom/bond string;
        an unseen string is added to the vocabulary when training, or
        recorded in `unknown` and mapped to 1 when not training

        Args:
            item (str): atom/bond string

        Returns:
            int: token of the string, or 1 for an unseen string when the
                tokenizer is not training
        """

        vocab = self._data

        # Known string: hand back its existing token
        if item in vocab:
            return vocab[item]

        # Unseen string outside of training: remember it, report 'unknown'
        if not self.train:
            self.unknown.append(item)
            return 1

        # Unseen string during training: assign the next unused integer
        new_token = self.num_classes + 1
        self.num_classes = new_token
        vocab[item] = new_token
        return new_token

    @property
    def vocab_size(self) -> int:
        """ vocab_size: returns `num_classes + 1`, i.e. one more than the
        highest token assigned so far (tokens handed out by this class
        start at 1, so the value 0 is never assigned)

        Returns:
            int: highest assigned token plus one
        """

        return 1 + self.num_classes

vocab_size: int property

vocab_size: returns `num_classes + 1`, i.e. one more than the number of entries in the tokenizer's vocabulary (the vocabulary includes the reserved 'unk' entry, and token values start at 1)

Returns:

Name Type Description
int int

number of strings in vocabulary

__call__(item)

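Tokenizer(): returns the integer value of an atom/bond string. If the string is not yet in the vocabulary, it is added and given a new integer while the tokenizer is training; otherwise it is recorded as unknown and 1 (the 'unk' value) is returned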

Parameters:

Name Type Description Default
item str

atom/bond string

required

Returns:

Name Type Description
int int

integer value of atom/bond string

Source code in graphchem/preprocessing/features.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def __call__(self, item: str) -> int:
    """ Tokenizer(): looks up the integer token of an atom/bond string;
    an unseen string is added to the vocabulary when training, or
    recorded in `unknown` and mapped to 1 when not training

    Args:
        item (str): atom/bond string

    Returns:
        int: token of the string, or 1 for an unseen string when the
            tokenizer is not training
    """

    vocab = self._data

    # Known string: hand back its existing token
    if item in vocab:
        return vocab[item]

    # Unseen string outside of training: remember it, report 'unknown'
    if not self.train:
        self.unknown.append(item)
        return 1

    # Unseen string during training: assign the next unused integer
    new_token = self.num_classes + 1
    self.num_classes = new_token
    vocab[item] = new_token
    return new_token

__init__()

Tokenizer object: integer tokenizer for unique atom/bond strings

Source code in graphchem/preprocessing/features.py
81
82
83
84
85
86
87
88
def __init__(self):
    """ Tokenizer object: starts in training mode with only the 'unk'
    entry in its vocabulary and no recorded unknown strings
    """

    self.unknown = []          # unseen strings met while not training
    self.train = True          # when True, unseen strings are learned
    self.num_classes = 1       # highest token handed out so far
    self._data = {'unk': 1}    # string -> integer token