Skip to content

Commit

Permalink
Added training file for English contractions
Browse files Browse the repository at this point in the history
lemma(isn't) -> be
lemma('m) -> be
  • Loading branch information
Alexandre Point committed Feb 4, 2015
1 parent bff3bc5 commit e479b48
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 19 deletions.
26 changes: 26 additions & 0 deletions SourceFileBuilder/Input/english-contractions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Add missing lemmas in this file to enrich the lemmatizer.
don't do
doesn't do
didn't did
won't will
shan't shall
can't can
couldn't could
wouldn't would
shouldn't should
mustn't must
mightn't might
oughtn't ought
needn't need
aren't are
isn't be
wasn't be
weren't be
haven't have
hasn't have
hadn't have
's 's
've have
'm be
're be
'll will
12 changes: 11 additions & 1 deletion SourceFileBuilder/Input/english-lemma-enricher.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@ stove stave
vacuumed vacuum
whiled while
wigged wig
zoned zone
zoned zone
ballsed balls
shore shore
feces feces
ideating ideate
coder code
zoning zone
bing bing
sped speed
lied lie
jihad jihad
34 changes: 18 additions & 16 deletions SourceFileBuilder/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Program
{
static void Main(string[] args)
{

var currentDirectory = Environment.CurrentDirectory + "/../../";
var lemmatizerFilePath = currentDirectory + "../Test/Data/full7z-mlteast-en.lem";

Expand All @@ -23,23 +24,17 @@ static void Main(string[] args)

var enricherFilePaths = Directory.GetFiles(currentDirectory + "Input/");

EnrichLemmatizerFile(lemmatizerFilePath, outputFilePath, enricherFilePaths);

Console.WriteLine("OK");
Console.ReadKey();
}

private static void EnrichLemmatizerFile(string lemmatizerFilePath, string outputFilePath,
IEnumerable<string> enricherFilePaths)
{
using (var stream = File.OpenRead(lemmatizerFilePath))
{
// create base lemmatizer with data in the base source file
var lemmatizer = new Lemmatizer(stream);
// enrich lemmatizer with every other file

// then, enrich lemmatizer with every other file
foreach (var filePath in enricherFilePaths)
{
EnrichLemmatizer(lemmatizer, filePath);
}
{
EnrichLemmatizerWithDataFile(lemmatizer, filePath);
}

// persist lemmatizer in output file
Console.WriteLine("Writing output file...");
Expand All @@ -49,17 +44,21 @@ private static void EnrichLemmatizerFile(string lemmatizerFilePath, string outpu
}
Console.WriteLine("Outuput file written at {0}", outputFilePath);
}

Console.WriteLine("OK");
Console.ReadKey();
}

private static void EnrichLemmatizer(Lemmatizer lemmatizer, string enricherFilePath)

private static void EnrichLemmatizerWithDataFile(Lemmatizer lemmatizer, string enricherFilePath)
{
var fileReader = new EnricherFileReader(enricherFilePath);
var newLemmas = fileReader.ReadAllLemmaEntries();

EnrichLemmatizer(lemmatizer, newLemmas);
EnrichLemmatizerWithExamples(lemmatizer, newLemmas);
}

private static void EnrichLemmatizer(Lemmatizer lemmatizer, IEnumerable<Tuple<string, string, int>> wordsAndLemmaToAdd)
private static void EnrichLemmatizerWithExamples(Lemmatizer lemmatizer, IEnumerable<Tuple<string, string, int>> wordsAndLemmaToAdd)
{
// add new words and lemma
foreach (var wordAndLemma in wordsAndLemmaToAdd)
Expand All @@ -70,12 +69,15 @@ private static void EnrichLemmatizer(Lemmatizer lemmatizer, IEnumerable<Tuple<st

private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
{
// compute the lemma of this example
var computedLemma = lemmatizer.Lemmatize(word);

if(computedLemma != lemma)
{
// add example
// if the computed lemma is different from what we expect,
// add this example to lemmatizer (lemmatizer can then deduce a new rule and succeed, or still fail)
lemmatizer.AddExample(word, lemma);

// if it still doesn't work --> add exception
var computedLemma2 = lemmatizer.Lemmatize(word);
if (computedLemma2 != lemma)
Expand Down
Binary file modified Test/Data/Custom/full7z-mlteast-en-modified.lem
Binary file not shown.
29 changes: 27 additions & 2 deletions Test/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ static void Main(string[] args)
// add examples
var examples = new List<Tuple<string, string>>()
{
new Tuple<string,string>("acting","act"),
/*new Tuple<string,string>("acting","act"),
new Tuple<string,string>("balled","ball"),
new Tuple<string,string>("balled","ball"),
new Tuple<string,string>("ballsed","balls"),
Expand Down Expand Up @@ -75,7 +75,32 @@ static void Main(string[] args)
new Tuple<string,string>("vacuumed","vacuum"),
new Tuple<string,string>("whiled","while"),
new Tuple<string,string>("wigged","wig"),
new Tuple<string,string>("zoned","zone"),
new Tuple<string,string>("zoned","zone"),*/
new Tuple<string,string>("don't","do"),
new Tuple<string,string>("doesn't","do"),
new Tuple<string,string>("didn't","did"),
new Tuple<string,string>("won't","will"),
new Tuple<string,string>("shan't","shall"),
new Tuple<string,string>("can't","can"),
new Tuple<string,string>("couldn't","could"),
new Tuple<string,string>("wouldn't","would"),
new Tuple<string,string>("shouldn't","should"),
new Tuple<string,string>("mustn't","must"),
new Tuple<string,string>("mightn't","might"),
new Tuple<string,string>("oughtn't","ought"),
new Tuple<string,string>("needn't","need"),
new Tuple<string,string>("aren't","are"),
new Tuple<string,string>("isn't","be"),
new Tuple<string,string>("wasn't","be"),
new Tuple<string,string>("weren't","be"),
new Tuple<string,string>("haven't","have"),
new Tuple<string,string>("hasn't","have"),
new Tuple<string,string>("hadn't","have"),
new Tuple<string,string>("'s", "'s"),
new Tuple<string,string>("'ve", "have"),
new Tuple<string,string>("'m", "be"),
new Tuple<string,string>("'re", "be"),
new Tuple<string,string>("'ll", "will"),
};
foreach (var example in examples)
{
Expand Down

0 comments on commit e479b48

Please sign in to comment.