Skip to content

Commit

Permalink
add README for major components; add eval scripts for dst and word_dst
Browse files Browse the repository at this point in the history
  • Loading branch information
zz-jacob committed Jun 11, 2019
1 parent d4cea08 commit 4dbcd41
Show file tree
Hide file tree
Showing 14 changed files with 5,018 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ data/multiwoz/annotated_qr_pairs_30k.txt
data/multiwoz/annoted_bio_all_20k.txt
data/multiwoz/annotated_user_utts_v2.txt
data/multiwoz/annotated_user_utts_v2_20k.txt
data/multiwoz/test.json

# private test script
private_test.py
Expand Down
20 changes: 20 additions & 0 deletions convlab/modules/dst/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Dialog State Tracking

In the pipeline task-oriented dialog framework, the DST module encodes
the dialog history into a pre-defined state representation, which can be
either structured or unstructured.
In each turn, it takes as input the dialog act of user utterances, and updates
its internal state variable.

This directory contains the interface definition of dialog state
tracking module and some act-level DST module implementations.

## Interface

The interfaces of DST are defined in state_tracker.Tracker, including:

- **update** takes as input the new observation in each turn, and update
the internal state variable of DST component. The new observation is the
dialog act of user utterance, which may be the output of NLU module.

- **reset** reset the internal state variable for a new dialog session.
166 changes: 126 additions & 40 deletions convlab/modules/dst/multiwoz/dst_util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import re
from difflib import SequenceMatcher

Expand Down Expand Up @@ -111,6 +110,33 @@ def init_state():
def str_similar(a, b):
return SequenceMatcher(None, a, b).ratio()

def _log(info):
with open('fuzzy_recognition.log', 'a+') as f:
f.write('{}\n'.format(info))
f.close()

def minDistance(word1, word2):
"""The minimum edit distance between word 1 and 2."""
if not word1:
return len(word2 or '') or 0
if not word2:
return len(word1 or '') or 0
size1 = len(word1)
size2 = len(word2)
tmp = list(range(size2 + 1))
value = None
for i in range(size1):
tmp[0] = i + 1
last = i
for j in range(size2):
if word1[i] == word2[j]:
value = last
else:
value = 1 + min(last, tmp[j], tmp[j + 1])
last = tmp[j+1]
tmp[j+1] = value
return value

def normalize_value(value_set, domain, slot, value):
"""
Normalized the value produced by NLU module to map it to the ontology value space.
Expand All @@ -119,57 +145,117 @@ def normalize_value(value_set, domain, slot, value):
domain (str): The domain of the slot-value pairs.
slot (str): The slot of the value.
value (str): The raw value detected by NLU module.
Returns:
value (str): The normalized value, which fits with the domain ontology.
"""
slot = slot.lower()
value = value.lower()
value = ' '.join(value.split())
try:
assert domain in value_set
except:
raise Exception('domain <{}> not found in value set'.format(domain))
if slot not in value_set[domain]:
print(value_set[domain].keys())
raise Exception('slot <{}> not found in db_values[{}]'.format(slot, domain))
value_list = value_set[domain][slot]
# for time type slots
if slot in ['leaveat', 'arriveby']:
mat = re.search(r"(\d{1,2}:\d{1,2})", value)
if mat is not None:
value = mat.groups()[0]
else:
value = "00:00" # TODO: check default value
return value
# for entrance fee
if slot == 'entrance fee':
if 'free' in value:
return 'free'
mat = re.search(r"(\d{1}.\d{1,2}) pounds", value)
if mat is not None:
value = mat.groups()[0]
return value
mat = re.search(r"(\d{1}) pounds", value)
if mat is not None:
value = mat.groups()[0]
return value
return '5 pounds' # TODO: check deafult value
# for ideal condition
elif value in value_list:
# exact match or containing match
v = _match_or_contain(value, value_list)
if v is not None:
return v
# some transfomations
cand_values = _transform_value(value)
for cv in cand_values:
v = _match_or_contain(cv, value_list)
if v is not None:
return v
# special value matching
v = special_match(domain, slot, value)
if v is not None:
return v
_log('Failed: domain {} slot {} value {}, raw value returned.'.format(domain, slot, value))
return value

def _transform_value(value):
cand_list = []
# a 's -> a's
if " 's" in value:
cand_list.append(value.replace(" 's", "'s"))
# a - b -> a-b
if " - " in value:
cand_list.append(value.replace(" - ", "-"))
# center <-> centre
if value == 'center':
cand_list.append('centre')
elif value == 'centre':
cand_list.append('center')
# the + value
if not value.startswith('the '):
cand_list.append('the ' + value)
return cand_list

def _match_or_contain(value, value_list):
"""match value by exact match or containing"""
if value in value_list:
return value
# for fuzzy value recognition
else:
best_value = value
best_score = -1
for v1 in value_list:
score = str_similar(value, v1)
if score > best_score:
best_score = score
best_value = v1
with open('fuzzy_recognition.log', 'a+') as f:
f.write('{} -> {}\n'.format(value, best_value))
return best_value
for v in value_list:
if v in value or value in v:
return v
## fuzzy match, when len(value) is large and distance(v1, v2) is small
for v in value_list:
d = minDistance(value, v)
if (d <= 2 and len(value) >= 10) or (d <= 3 and len(value) >= 15):
return v
return None

def special_match(domain, slot, value):
"""special slot fuzzy matching"""
matched_result = None
if slot == 'arriveby' or slot == 'leaveat':
matched_result = _match_time(value)
elif slot == 'price' or slot == 'entrance fee':
matched_result = _match_pound_price(value)
elif slot == 'trainid':
matched_result = _match_trainid(value)
elif slot == 'duration':
matched_result = _match_duration(value)
return matched_result

def _match_time(value):
"""Return the time (leaveby, arriveat) in value, None if no time in value."""
mat = re.search(r"(\d{1,2}:\d{1,2})", value)
if mat is not None and len(mat.groups()) > 0:
return mat.groups()[0]
return None

def _match_trainid(value):
"""Return the trainID in value, None if no trainID."""
mat = re.search(r"TR(\d{4})", value)
if mat is not None and len(mat.groups()) > 0:
return mat.groups()[0]
return None

def _match_pound_price(value):
"""Return the price with pounds in value, None if no trainID."""
mat = re.search(r"(\d{1,2},\d{1,2} pounds)", value)
if mat is not None and len(mat.groups()) > 0:
return mat.groups()[0]
mat = re.search(r"(\d{1,2} pounds)", value)
if mat is not None and len(mat.groups()) > 0:
return mat.groups()[0]
if "1 pound" in value.lower():
return '1 pound'
if 'free' in value:
return 'free'
return None

def _match_duration(value):
"""Return the durations (by minute) in value, None if no trainID."""
mat = re.search(r"(\d{1,2} minutes)", value)
if mat is not None and len(mat.groups()) > 0:
return mat.groups()[0]
return None

if __name__ == "__main__":
value_set = json.load(open('../../../data/multiwoz/db/db_values.json'))
print(normalize_value(value_set, 'restaurant', 'address', 'regent street city center'))
# value_set = json.load(open('../../../data/multiwoz/db/db_values.json'))
# print(normalize_value(value_set, 'restaurant', 'address', 'regent street city center'))
print(minDistance("museum of archaeology and anthropology", "museum of archaelogy and anthropology"))
Loading

0 comments on commit 4dbcd41

Please sign in to comment.