fix/readme
hjian42 committed Jan 20, 2022
1 parent 7ba8565 commit 2ac6cad
Showing 2 changed files with 61 additions and 34 deletions.
60 changes: 37 additions & 23 deletions README.md
This repo contains the new `Tweebank-NER` [dataset](./Tweebank-NER-v1.0) and the `Twitter-Stanza` pipeline.

```
# please install from the source
pip install -e ./twitter-stanza
pip install pythainlp
# download glove and pre-trained models
sh download_twitter_resources.sh
```
```python
import stanza

# config for the `en_tweet` models (trained only on Tweebank)
config = {
'processors': 'tokenize,lemma,pos,depparse,ner',
'lang': 'en',
'tokenize_pretokenized': True, # disable tokenization
    'tokenize_model_path': './twitter-stanza/saved_models/tokenize/en_tweet_tokenizer.pt',
    'lemma_model_path': './twitter-stanza/saved_models/lemma/en_tweet_lemmatizer.pt',
    'pos_model_path': './twitter-stanza/saved_models/pos/en_tweet_tagger.pt',
    'depparse_model_path': './twitter-stanza/saved_models/depparse/en_tweet_parser.pt',
    'ner_model_path': './twitter-stanza/saved_models/ner/en_tweet_nertagger.pt',
    'scheme': 'bio'
}

# Initialize the pipeline using a configuration dict
stanza.download("en")
nlp = stanza.Pipeline(**config)
doc = nlp("Oh ikr like Messi better than Ronaldo but we all like Ronaldo more")
print(doc) # Look at the result
```
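The returned `doc` is a regular Stanza `Document`, so its annotations can be read off with the standard Stanza API. A minimal sketch (attribute names follow Stanza's documented interface):

```python
# walk the parsed document: each word carries lemma, POS, and dependency info,
# and each sentence exposes the NER spans predicted under the BIO scheme
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.lemma, word.upos, word.head, word.deprel)
    for ent in sentence.ents:
        print(ent.text, ent.type)
```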

## Command-line Interface for Twitter-Stanza

### NER

We provide two pre-trained Stanza NER models:
- `en_tweetwnut17`: trained on `TB2+WNUT17`
- `en_tweet`: trained on `TB2`

```
cd twitter-stanza
shorthand=en_tweetwnut17
python stanza/utils/training/run_ner.py ${shorthand} \
--mode predict \
--score_test \
--wordvec_file ../data/wordvec/English/en.twitter100d.xz \
--eval_file data/ner/en_tweet.test.json \
--save_dir ./saved_models/ner \
--save_name ${shorthand}_nertagger.pt \
--scheme bio
```
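The `--scheme bio` flag refers to standard BIO tagging, where `B-X` opens an entity of type `X`, `I-X` continues it, and `O` marks tokens outside any entity. Purely as an illustration (this helper is not part of the repo), folding a BIO sequence into entity spans looks like this:

```python
def bio_to_spans(tags):
    """Collapse a BIO tag sequence into (type, start, end) spans; end is exclusive."""
    spans, start, label = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith("B-") or (tag.startswith("I-") and label != tag[2:]):
            if label is not None:          # close the entity we were inside
                spans.append((label, start, i))
            start, label = i, tag[2:]      # open a new entity
        elif tag == "O" and label is not None:
            spans.append((label, start, i))
            start, label = None, None
    if label is not None:                  # entity running to the end of the sentence
        spans.append((label, start, len(tags)))
    return spans

# bio_to_spans(["B-LOC", "I-LOC", "O", "B-PER"]) -> [("LOC", 0, 2), ("PER", 3, 4)]
```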

### Syntactic NLP Models

We provide two pre-trained models for each NLP task; specify one of the following shorthands:
- `en_tweetewt`: trained on `TB2+UD-English-EWT`
- `en_tweet`: trained on `TB2`

#### 1. Tokenization
```
shorthand=en_tweet
python stanza/utils/training/run_tokenizer.py ${shorthand} \
--mode predict \
--score_test \
--txt_file data/tokenize/en_tweet.test.txt
```

#### 2. Lemmatization
```
shorthand=en_tweet
python stanza/utils/training/run_lemma.py ${shorthand} \
--mode predict \
--score_test \
--gold_file data/lemma/en_tweet.test.gold.conllu \
--eval_file data/lemma/en_tweet.test.in.conllu
```

#### 3. POS Tagging
```
shorthand=en_tweetewt
python stanza/utils/training/run_pos.py ${shorthand} \
--mode predict \
--score_test \
--eval_file data/pos/en_tweet.test.in.conllu \
--gold_file data/pos/en_tweet.test.gold.conllu \
--wordvec_file ../data/wordvec/English/en.twitter100d.xz \
--load_name ./saved_models/pos/${shorthand}_tagger.pt
```

#### 4. Dependency Parsing

```
shorthand=en_tweetewt
python stanza/utils/training/run_depparse.py ${shorthand} \
--mode predict \
--score_test \
--wordvec_file ../data/wordvec/English/en.twitter100d.xz \
--eval_file data/depparse/en_tweet.test.in.conllu \
--gold_file data/depparse/en_tweet.test.gold.conllu
```
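Here `--score_test` evaluates the predicted parses in `--eval_file` against `--gold_file`. For intuition about what is being scored, below is a simplified, self-contained sketch of UAS/LAS over two CoNLL-U files; it is an illustration of the metric, not the evaluator this repo actually invokes:

```python
def read_heads_deprels(path):
    """Yield each sentence as a list of (head, deprel) pairs from a CoNLL-U file."""
    sent = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:                       # blank line ends a sentence
                if sent:
                    yield sent
                    sent = []
            elif not line.startswith("#"):     # skip comment lines
                cols = line.split("\t")
                if cols[0].isdigit():          # skip multiword-token ranges like "3-4"
                    sent.append((cols[6], cols[7]))   # HEAD and DEPREL columns
    if sent:
        yield sent

def attachment_scores(gold_path, pred_path):
    total = uas = las = 0
    for gold, pred in zip(read_heads_deprels(gold_path), read_heads_deprels(pred_path)):
        for (gh, gd), (ph, pd) in zip(gold, pred):
            total += 1
            uas += gh == ph                    # unlabeled: correct head
            las += gh == ph and gd == pd       # labeled: correct head and relation
    return uas / total, las / total
```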
## Citation

If you use this repository in your research, please kindly cite our paper as well:

```
@article{jiang2022tweebank,
title={Annotating the Tweebank Corpus on Named Entity Recognition and Building NLP Models for Social Media Analysis},
author={Jiang, Hang and Hua, Yining and Beeferman, Doug and Roy, Deb},
    journal={arXiv preprint arXiv:2201.07281},
year={2022}
}
```
35 changes: 24 additions & 11 deletions TRAIN_README.md
## NER

You can specify two data settings:
- `en_tweetwnut17`: trained on `TB2+WNUT17`
- `en_tweet`: trained on `TB2`

```
shorthand=en_tweetwnut17
python stanza/utils/training/run_ner.py ${shorthand} \
--wordvec_file ../data/wordvec/English/en.twitter100d.xz \
--eval_file data/ner/en_tweet.dev.json
```
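Stanza's NER tools read training and evaluation data from `.json` files. To the best of our knowledge the layout is a list of sentences, each a list of `{"text": ..., "ner": ...}` token dicts carrying BIO tags; treat the exact schema as an assumption and check the output of Stanza's NER data-preparation scripts if in doubt. A toy file under that assumption:

```python
import json

# hypothetical two-token example in the assumed Stanza NER layout
sentences = [
    [
        {"text": "Messi", "ner": "B-PER"},
        {"text": "scored", "ner": "O"},
    ]
]

with open("toy.train.json", "w", encoding="utf-8") as f:
    json.dump(sentences, f, ensure_ascii=False, indent=2)
```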

## Syntactic NLP Models
You can specify two data settings:
- `en_tweetewt`: trained on `TB2+UD-English-EWT`
- `en_tweet`: trained on `TB2`

#### 1. Tokenizer
```
## assign the shorthand name
shorthand=en_tweet
## Data Preparation
python -m stanza.utils.datasets.prepare_tokenizer_treebank ${shorthand} --no_use_mwt
## Train
python stanza/utils/training/run_tokenizer.py ${shorthand} --no_use_mwt
```
#### 2. Lemmatization
```
## assign the shorthand name
shorthand=en_tweet
## Data Preparation
python -m stanza.utils.datasets.prepare_lemma_treebank ${shorthand}
## Train
python stanza/utils/training/run_lemma.py ${shorthand}
```
#### 3. POS Tagging
```
## assign the shorthand name
shorthand=en_tweetewt
## Data Preparation
python -m stanza.utils.datasets.prepare_pos_treebank ${shorthand}
## Train
python stanza/utils/training/run_pos.py ${shorthand} --wordvec_file ../data/wordvec/English/en.twitter100d.xz --no_pretrain
```

#### 4. Dependency Parser

```
## assign the shorthand name
shorthand=en_tweet
## Data Preparation
## --gold would produce gold dependency data, but by convention we did not use gold data for our depparse model
python -m stanza.utils.datasets.prepare_depparse_treebank ${shorthand}
## Train
python stanza/utils/training/run_depparse.py ${shorthand} --wordvec_file ../data/wordvec/English/en.twitter100d.xz --no_pretrain
```

## Summary
