UTTerance UTilities for dialogue systems. This package provides general utilities for processing chatbot utterance data.
To create a pipe for BERT preprocessing, please take a look at BERT.
$ pip install uttut
Let's create a Pipe to preprocess a Datum with an English utterance.
>>> from uttut.pipeline.pipe import Pipe
>>> p = Pipe()
>>> p.add('IntTokenWithSpace')
>>> p.add('FloatTokenWithSpace')
>>> p.add('MergeWhiteSpaceCharacters')
>>> p.add('StripWhiteSpaceCharacters')
>>> p.add('EngTokenizer') # word-level (ref: BERT)
>>> p.add('AddSosEos', checkpoint='result_of_add_sos_eos')
>>> p.add('Pad', {'maxlen': 5})
>>> p.add(
'Token2Index',
{
'token2index': {
'<sos>': 0, '<eos>': 1, # for AddSosEos
'<unk>': 2, '<pad>': 3, # for Pad
'_int_': 4, # for IntTokenWithSpace
'_float_': 5, # for FloatTokenWithSpace
'I': 6,
'apples': 7,
},
},
)
>>> from uttut.elements import Datum, Entity, Intent
>>> datum = Datum(
utterance='I like apples.',
intents=[Intent(label=1), Intent(label=2)],
entities=[Entity(start=7, end=13, value='apples', label=7)],
)
>>> output_indices, intent_labels, entity_labels, label_aligner, intermediate = p.transform(datum)
>>> output_indices
[0, 6, 2, 7, 1, 3, 3]
>>> intent_labels
[1, 2]
>>> entity_labels
[0, 0, 0, 7, 0, 0, 0]
# intermediate
>>> intermediate.get_from_checkpoint('result_of_add_sos_eos')
["<sos>", "I", "like", "apples", "<eos>"]
# label_aligner
>>> label_aligner.inverse_transform(entity_labels)
[0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0]
>>> output_sequence, label_aligner, intermediate = p.transform_sequence('I like apples.')
>>> output_sequence
[0, 6, 2, 7, 1, 3, 3]
# label_aligner
>>> label_aligner.transform([0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0])
[0, 0, 0, 7, 0, 0, 0]
>>> label_aligner.inverse_transform([0, 0, 0, 7, 0, 0, 0])
[0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0]
# intermediate
>>> intermediate.get_from_checkpoint('result_of_add_sos_eos')
["<sos>", "I", "like", "apples", "<eos>"]
>>> serialized_str = p.serialize()
>>> from uttut.pipeline.pipe import Pipe
>>> p = Pipe.deserialize(serialized_str)