Commit d52bb7f: "merging, fixing errors"
mjfang27 authored and Stanford NLP committed Apr 2, 2017
1 parent a581061
Showing 915 changed files with 153,478 additions and 100,719 deletions.
README.md (6 changes: 3 additions & 3 deletions)
@@ -1,17 +1,17 @@
Stanford CoreNLP
================

-Stanford CoreNLP provides a set of natural language analysis tools written in Java. It can take raw human language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, and mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It was originally developed for English, but now also provides varying levels of support for Arabic, (mainland) Chinese, French, German, and Spanish. Stanford CoreNLP is an integrated framework, which make it very easy to apply a bunch of language analysis tools to a piece of text. Starting from plain text, you can run all the tools on it with just two lines of code. Its analyses provide the foundational building blocks for higher-level and domain-specific text understanding applications. Stanford CoreNLP is a set of stable and well-tested natural language processing tools, widely used by various groups in academia, government, and industry.
+Stanford CoreNLP provides a set of natural language analysis tools written in Java. It can take raw human language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize and interpret dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases or word dependencies, and indicate which noun phrases refer to the same entities. It was originally developed for English, but now also provides varying levels of support for (Modern Standard) Arabic, (mainland) Chinese, French, German, and Spanish. Stanford CoreNLP is an integrated framework, which make it very easy to apply a bunch of language analysis tools to a piece of text. Starting from plain text, you can run all the tools with just two lines of code. Its analyses provide the foundational building blocks for higher-level and domain-specific text understanding applications. Stanford CoreNLP is a set of stable and well-tested natural language processing tools, widely used by various groups in academia, industry, and government. The tools variously use rule-based, probabilistic machine learning, and deep learning components.

-The Stanford CoreNLP code is written in Java and licensed under the GNU General Public License (v3 or later). Note that this is the full GPL, which allows many free uses, but not its use in proprietary software that you distribute.
+The Stanford CoreNLP code is written in Java and licensed under the GNU General Public License (v3 or later). Note that this is the full GPL, which allows many free uses, but not its use in proprietary software that you distribute to others.

You can find releases of Stanford CoreNLP on [Maven Central](http://search.maven.org/#browse%7C11864822).

You can find more explanation and documentation on [the Stanford CoreNLP homepage](http://nlp.stanford.edu/software/corenlp.shtml#Demo).

The most recent models associated with the code in the HEAD of this repository can be found [here](http://nlp.stanford.edu/software/stanford-corenlp-models-current.jar).

-For information about making contributions to Stanford CoreNLP, see the file `CONTRIBUTING.md`.
+For information about making contributions to Stanford CoreNLP, see the file [CONTRIBUTING.md](CONTRIBUTING.md).

Questions about CoreNLP can either be posted on StackOverflow with the tag [stanford-nlp](http://stackoverflow.com/questions/tagged/stanford-nlp),
or on the [mailing lists](http://nlp.stanford.edu/software/corenlp.shtml#Mail).
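The README's promise of running all the tools "with just two lines of code" can be sketched as follows. This is an illustrative driver (the class name `PipelineSketch` and the sample sentence are mine); it assumes the CoreNLP jar and its models jar are on the classpath.

```java
import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineSketch {
    public static void main(String[] args) {
        // Choose which annotators to run, then build the pipeline once.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // The "two lines": wrap raw text in an Annotation and run every
        // configured tool over it.
        Annotation document = new Annotation("Stanford is a university in California.");
        pipeline.annotate(document);
    }
}
```

After `annotate` returns, the `Annotation` object holds the tokenized sentences, tags, parses, and coreference chains, which downstream code reads back out via typed keys.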
build.gradle (1 change: 1 addition & 0 deletions)
@@ -47,6 +47,7 @@ task listDeps << {

dependencies {
  compile fileTree(dir: 'lib', include: '*.jar')
+  testCompile fileTree(dir: 'liblocal', include: '*.jar')
}

// Eclipse plugin setup
build.xml (9 changes: 9 additions & 0 deletions)
@@ -26,6 +26,10 @@
      <include name="*.jar"/>
      <exclude name="javanlp*"/>
    </fileset>
+    <fileset dir="${basedir}/liblocal">
+      <include name="*.jar"/>
+      <exclude name="javanlp*"/>
+    </fileset>
  </path>
</target>

@@ -124,6 +128,11 @@
      <compilerarg value="-Xmaxwarns"/>
      <compilerarg value="10000"/> -->
    </javac>
+    <copy todir="${build.path}/edu/stanford/nlp/pipeline/demo">
+      <fileset dir="${source.path}/edu/stanford/nlp/pipeline/demo">
+        <exclude name="**/*.java"/>
+      </fileset>
+    </copy>
</target>

<target name="test" depends="classpath,compile"
data/edu/stanford/nlp/patterns/surface/example.properties (10 changes: 5 additions & 5 deletions)
@@ -17,16 +17,16 @@ outDir=SPIEDPatternsout
#Number of threads available on the machine
numThreads=1
#***Use these options if you are limited by memory
-batchProcessSents = true
+batchProcessSents = false
#This name is a misnomer. Max number of *lines* per batch file. Works only for text file format; ser files cannot be broken down
numMaxSentencesPerBatchFile=100
-saveInvertedIndex=true
+saveInvertedIndex=false
invertedIndexDirectory=${outDir}/invertedIndex
#Loading index from invertedIndexDirectory
#loadInvertedIndex=true

#Useful for memory heavy apps.
-invertedIndexClass=edu.stanford.nlp.patterns.LuceneSentenceIndex
+#invertedIndexClass=edu.stanford.nlp.patterns.LuceneSentenceIndex


### Example for running it on presidents biographies. For more data examples, see the bottom of this file
@@ -43,7 +43,7 @@ saveSentencesSerDir=${outDir}/sents
#fileFormat=ser
#file=${outDir}/sents

-#We are learning names of presidential candidates, places, and other names
+#We are learning names of presidential candidates, places, and other names. In each line, all text after tabs are ignored in these seed files
seedWordsFiles=NAME,${DIR}/names.txt;PLACE,${DIR}/places.txt;OTHER,${DIR}/otherpeople.txt
#Useful for matching lemmas or spelling mistakes
fuzzyMatch=false
@@ -103,7 +103,7 @@ targetAllowedTagsInitialsStr=NAME,N;OTHER,N
computeAllPatterns = true

#Options: MEMORY, DB, LUCENE. If using SQL for storing patterns for each token --- populate SQLConnection class, that is provide those properties!
-storePatsForEachToken=LUCENE
+storePatsForEachToken=MEMORY
#***If your code is running too slow, try to reduce this number. Samples % of sentences for learning patterns
sampleSentencesForSufficientStats=1.0
data/edu/stanford/nlp/ud/feature_map.txt (new file: 106 additions & 0 deletions)
@@ -0,0 +1,106 @@
* NN Number=Sing
* NNP Number=Sing
* NNS Number=Plur
* NNPS Number=Plur
* VBZ Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Mood=Ind
* VBD VerbForm=Fin|Mood=Ind|Tense=Past
* VBN Tense=Past|VerbForm=Part
* VBP VerbForm=Fin|Mood=Ind|Tense=Pres
* MD VerbForm=Fin
* JJ Degree=Pos
* JJR Degree=Cmp
* JJS Degree=Sup
* CD NumType=Card
am VBP VerbForm=Fin|Mood=Ind|Tense=Pres|Person=1|Number=Sing
was VBD VerbForm=Fin|Mood=Ind|Tense=Past|Number=Sing
i PRP Number=Sing|Person=1|PronType=Prs|Case=Nom
you PRP Person=2|PronType=Prs
he PRP Number=Sing|Person=3|Gender=Masc|PronType=Prs|Case=Nom
she PRP Number=Sing|Person=3|Gender=Fem|PronType=Prs|Case=Nom
it PRP Number=Sing|Person=3|Gender=Neut|PronType=Prs
we PRP Number=Plur|Person=1|PronType=Prs|Case=Nom
they PRP Number=Plur|Person=3|PronType=Prs|Case=Nom
me PRP Number=Sing|Person=1|PronType=Prs|Case=Acc
him PRP Number=Sing|Person=3|Gender=Masc|PronType=Prs|Case=Acc
her PRP Number=Sing|Person=3|Gender=Fem|PronType=Prs|Case=Acc
us PRP Number=Plur|Person=1|PronType=Prs|Case=Acc
them PRP Number=Plur|Person=3|PronType=Prs|Case=Acc
my PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs
mine PRP$ Number=Sing|Person=1|Poss=Yes|PronType=Prs
your PRP$ Person=2|Poss=Yes|PronType=Prs
yours PRP$ Person=2|Poss=Yes|PronType=Prs
his PRP$ Number=Sing|Person=3|Gender=Masc|Poss=Yes|PronType=Prs
her PRP$ Number=Sing|Person=3|Gender=Fem|Poss=Yes|PronType=Prs
hers PRP$ Number=Sing|Person=3|Gender=Fem|Poss=Yes|PronType=Prs
its PRP$ Number=Sing|Person=3|Gender=Neut|Poss=Yes|PronType=Prs
our PRP$ Number=Plur|Person=1|Poss=Yes|PronType=Prs
ours PRP$ Number=Plur|Person=1|Poss=Yes|PronType=Prs
their PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs
theirs PRP$ Number=Plur|Person=3|Poss=Yes|PronType=Prs
myself PRP Number=Sing|Person=1|PronType=Prs
yourself PRP Number=Sing|Person=2|PronType=Prs
himself PRP Number=Sing|Person=3|Gender=Masc|PronType=Prs
herself PRP Number=Sing|Person=3|Gender=Fem|PronType=Prs
itself PRP Number=Sing|Person=3|Gender=Neut|PronType=Prs
ourselves PRP Number=Plur|Person=1|PronType=Prs
yourselves PRP Number=Plur|Person=2|PronType=Prs
themselves PRP Number=Plur|Person=3|PronType=Prs
the DT Definite=Def|PronType=Art
a DT Definite=Ind|PronType=Art
an DT Definite=Ind|PronType=Art
this DT PronType=Dem|Number=Sing
that DT PronType=Dem|Number=Sing
these DT PronType=Dem|Number=Plur
those DT PronType=Dem|Number=Plur
here RB PronType=Dem
there RB PronType=Dem
then RB PronType=Dem
whose WP$ Poss=Yes
hard RB Degree=Pos
fast RB Degree=Pos
late RB Degree=Pos
long RB Degree=Pos
high RB Degree=Pos
easy RB Degree=Pos
early RB Degree=Pos
far RB Degree=Pos
soon RB Degree=Pos
low RB Degree=Pos
close RB Degree=Pos
well RB Degree=Pos
badly RB Degree=Pos
little RB Degree=Pos
harder RBR Degree=Cmp
faster RBR Degree=Cmp
later RBR Degree=Cmp
longer RBR Degree=Cmp
higher RBR Degree=Cmp
easier RBR Degree=Cmp
quicker RBR Degree=Cmp
earlier RBR Degree=Cmp
further RBR Degree=Cmp
farther RBR Degree=Cmp
sooner RBR Degree=Cmp
slower RBR Degree=Cmp
lower RBR Degree=Cmp
closer RBR Degree=Cmp
better RBR Degree=Cmp
worse RBR Degree=Cmp
less RBR Degree=Cmp
hardest RBS Degree=Sup
fastest RBS Degree=Sup
latest RBS Degree=Sup
longest RBS Degree=Sup
highest RBS Degree=Sup
easiest RBS Degree=Sup
quickest RBS Degree=Sup
earliest RBS Degree=Sup
furthest RBS Degree=Sup
farthest RBS Degree=Sup
soonest RBS Degree=Sup
slowest RBS Degree=Sup
lowest RBS Degree=Sup
closest RBS Degree=Sup
best RBS Degree=Sup
worst RBS Degree=Sup
least RBS Degree=Sup
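Each line of the new feature map above has three whitespace-separated fields: a word form ("*" acts as a wildcard over words), a Penn Treebank tag, and a "|"-separated list of Key=Value Universal Dependencies features. A minimal parser for this format could look like the following sketch (the class and method names are illustrative, not part of CoreNLP):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class FeatureMapLine {
    /**
     * Parses one line of feature_map.txt, e.g.
     *   "was VBD VerbForm=Fin|Mood=Ind|Tense=Past|Number=Sing"
     * into an ordered map of UD feature names to values.
     */
    public static Map<String, String> parseFeatures(String line) {
        // Split into word, tag, and the feature bundle (at most 3 fields).
        String[] fields = line.trim().split("\\s+", 3);
        Map<String, String> features = new LinkedHashMap<>();
        for (String kv : fields[2].split("\\|")) {
            String[] pair = kv.split("=", 2);
            features.put(pair[0], pair[1]);
        }
        return features;
    }

    public static void main(String[] args) {
        Map<String, String> f =
            parseFeatures("was VBD VerbForm=Fin|Mood=Ind|Tense=Past|Number=Sing");
        System.out.println(f);
    }
}
```

A lookup against such a map would first try the exact (word, tag) pair and fall back to the "*" wildcard entry for the tag, which is how the file's layout (specific pronoun entries after generic tag entries) reads.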
data/edu/stanford/nlp/upos/ENUniversalPOS.tsurgeon (28 changes: 19 additions & 9 deletions)
@@ -64,7 +64,7 @@ relabel target AUX
%relabel target AUX

% VB.* -> AUX (active, case 1)
-VP < VP < (/^VB.*$/=target <... {/.*/})
+VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)

relabel target AUX

@@ -78,8 +78,13 @@ relabel target AUX

relabel target VERB

-% IN -> SCONJ (only in case of subordinating conjunctions)
-/^SBAR(-[^ ]+)?$/ < (IN=target $++ S|FRAG <... {/.*/})
+% IN -> SCONJ (subordinating conjunctions)
+/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})

relabel target SCONJ
+
+% IN -> SCONJ (subordinating conjunctions II)
+@PP < (IN=target $+ @SBAR|S)
+
+relabel target SCONJ

@@ -109,7 +114,7 @@ NFP=target <... {/.*/}
relabel target SYM

% RB -> PART when it is verbal negation (not or its reductions)
-@VP|SINV|SQ|FRAG < (RB=target < /^(?i:not|n't|nt|t|n)$/)
+@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)

relabel target PART

@@ -118,6 +123,16 @@ RB=target <... {/.*/}

relabel target ADV

+% DT -> PRON (pronominal this/that/these/those)
+@NP <: (DT=target < /^[Tt]h(is|at|ose|ese)$/)
+
+relabel target PRON
+
+%DT -> DET
+DT=target < __
+
+relabel target DET
+
% ------------------------------
% 1 to 1 mappings
%
@@ -132,11 +147,6 @@ CD=target <... {/.*/}

relabel target NUM

-% DT -> DET
-DT=target <... {/.*/}
-
-relabel target DET
-
% EX -> PRON
EX=target <... {/.*/}
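The new DT -> PRON rule in this file relabels a demonstrative that is the only child of its noun phrase (the `<:` operator matches a sole child), so that pronominal "that" is distinguished from determiner "that". A sketch of its effect on a parse tree (the example tree is mine, not from the commit):

```
Before: (VP (VBD liked) (NP (DT that)))
After:  (VP (VBD liked) (NP (PRON that)))
```

A determiner use such as "(NP (DT that) (NN book))" does not match, because the DT is not the NP's only child; it is instead caught by the later, more general DT -> DET rule. Ordering the specific rule before the general one is why the old standalone "DT -> DET" mapping was moved up in this commit.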
doc/README (2 changes: 2 additions & 0 deletions)
@@ -8,3 +8,5 @@ zips we release

classify, lexparser, ner, segmenter: documentation included in various
packages, such as readmes, build files, etc
+
+loglinear: architectural explanation and various tutorials
doc/corenlp/pom-full.xml (7 changes: 6 additions & 1 deletion)
@@ -43,7 +43,7 @@
    <dependency>
      <groupId>joda-time</groupId>
      <artifactId>joda-time</artifactId>
-      <version>2.1</version>
+      <version>2.9</version>
    </dependency>
    <dependency>
      <groupId>de.jollyday</groupId>
@@ -60,6 +60,11 @@
      <artifactId>javax.json-api</artifactId>
      <version>1.0</version>
    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.7.12</version>
+    </dependency>
  </dependencies>
  <build>
    <sourceDirectory>src</sourceDirectory>