added dailymail dataset to summarization training
plkmo committed Dec 27, 2019
1 parent b836587 commit e0820f3
Showing 6 changed files with 55 additions and 19 deletions.
14 changes: 13 additions & 1 deletion README.md
@@ -176,7 +176,19 @@ Text summarization aims to distil a paragraph chunk into a few sentences that ca
2. Seq2Seq (LAS architecture)

### Format of dataset files
One .csv file for each text/summary pair. Within each text/summary .csv file, the main text comes first, followed by the summary points, each marked by an @highlight tag.
E.g. example.csv:
```bash
Main text here
@highlight

Summary 1

@highlight

Summary 2

```
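
For reference, a minimal, illustrative sketch of how such a file can be split into the main text and its summary points; the `split_story` helper and the regex are this example's own, not part of the repository (the actual parsing lives in nlptoolkit/summarization/preprocessing_funcs.py):
```python
import re

def split_story(raw_text):
    # Everything before the first @highlight tag is the article body;
    # the remainder is a series of @highlight-prefixed summary points.
    body = raw_text[:re.search("@highlight", raw_text).span(0)[0]].strip()
    rest = re.search("@highlight(.*)", raw_text, flags=re.DOTALL).group(1)
    highlights = [h.strip() for h in rest.split("@highlight") if h.strip()]
    return body, highlights

with open("example.csv", encoding="utf8") as f:   # the file shown above
    body, highlights = split_story(f.read())
```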

### Running the model
Run summarize.py with the arguments below
2 changes: 1 addition & 1 deletion nlptoolkit/summarization/models/InputConv_Transformer.py
@@ -95,7 +95,7 @@ def Attention(q, k, v, dh, mask=None, g_mask=None, dropout=None):
scores = torch.matmul(q, k.transpose(-2,-1))/math.sqrt(dh)
if mask is not None:
mask = mask.unsqueeze(1); #print("Mask", mask.shape); print("scores", scores.shape)
scores = scores.masked_fill(mask == 0, -1e9)
scores = scores.masked_fill(mask == 0, -5e4)

if g_mask is not None:
scores = scores + g_mask
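
The switch from -1e9 to -5e4 presumably keeps the fill value representable in float16 (whose most negative finite value is roughly -65504), which matters when training with the --fp16 option in summarize.py; a -1e9 fill overflows to -inf in half precision and can produce NaNs downstream. A self-contained sketch of the same masking pattern, written independently of the repository's Attention function:
```python
import math
import torch
import torch.nn.functional as F

def masked_attention_scores(q, k, mask, fill_value=-5e4):
    # Scaled dot-product scores; masked positions get a large negative
    # value that is still finite in float16.
    dh = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dh)
    scores = scores.masked_fill(mask == 0, fill_value)
    return F.softmax(scores, dim=-1)

q = k = torch.randn(2, 4, 5, 8)   # (batch, heads, seq_len, dh)
mask = torch.ones(2, 1, 5, 5)     # broadcast over heads
mask[:, :, :, -1] = 0             # hide the last key position
attn = masked_attention_scores(q, k, mask)
```
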
44 changes: 32 additions & 12 deletions nlptoolkit/summarization/preprocessing_funcs.py
@@ -40,18 +40,18 @@ def clean_and_tokenize_text(text, table, tokenizer, clean_only=False):
text = [w for w in text if not any(char.isdigit() for char in w)]
return text

def get_CNN_data(args, load_extracted=True):
def get_data(args, load_extracted=True):
"""
Extracts CNN dataset, saves then
Extracts CNN and/or dailymail dataset, saves then
returns dataframe containing body (main text) and highlights (summarized text)
table: table containing symbols to remove from text
tokenizer: tokenizer to tokenize text into word tokens
"""
path = args.data_path
path = args.data_path1
tokenizer_en = tokener()
table = str.maketrans("", "", '"#$%&\'()*+-/:;<=>@[\\]^_`{|}~')
if load_extracted:
df = load_pickle("df_unencoded_CNN.pkl")
df = load_pickle("df_unencoded.pkl")
else:
logger.info("Extracting CNN stories...")
df = pd.DataFrame(index=[i for i in range(len(os.listdir(path)))], columns=["body", "highlights"])
@@ -66,8 +66,28 @@ def get_CNN_data(args, load_extracted=True):
body = text[:re.search("@highlight", text).span(0)[0]]
df.iloc[idx]["body"] = body
df.iloc[idx]["highlights"] = highlights
save_as_pickle("df_unencoded_CNN.pkl", df)

if len(args.data_path2) > 2:
path = args.data_path2
logger.info("Extracting dailymail stories...")
df1 = pd.DataFrame(index=[i for i in range(len(os.listdir(path)))], columns=["body", "highlights"])
for idx, file in tqdm(enumerate(os.listdir(path)), total=len(os.listdir(path))):
with open(os.path.join(path, file), encoding="utf8") as csv_file:
csv_reader = csv.reader(csv_file)
text = ""
for row in csv_reader:
text += "".join(t for t in row)
highlights = re.search("@highlight(.*)", text).group(1)
highlights = highlights.replace("@highlight", ". ")
body = text[:re.search("@highlight", text).span(0)[0]]
df1.iloc[idx]["body"] = body
df1.iloc[idx]["highlights"] = highlights
df = pd.concat([df, df1], ignore_index=True)
del df1

save_as_pickle("df_unencoded.pkl", df)
logger.info("Dataset length: %d" % len(df))

if (args.level == "word") or (args.level == "char"):
logger.info("Tokenizing and cleaning extracted text...")
df.loc[:, "body"] = df.apply(lambda x: clean_and_tokenize_text(x["body"], table, tokenizer_en), axis=1)
@@ -85,7 +105,7 @@ def get_CNN_data(args, load_extracted=True):
df.loc[:, "body"] = df.apply(lambda x: v.convert_w2idx(x["body"]), axis=1)
df.loc[:, "highlights"] = df.apply(lambda x: v.convert_w2idx(x["highlights"]), axis=1)
df.loc[:, "highlights"] = df.apply(lambda x: pad_sos_eos(x["highlights"], 0, 2), axis=1)
save_as_pickle("df_encoded_CNN.pkl", df)
save_as_pickle("df_encoded.pkl", df)
save_as_pickle("vocab.pkl", v)

elif args.level == "bpe":
@@ -116,7 +136,7 @@ def get_CNN_data(args, load_extracted=True):
df.loc[:, "highlights"] = df.apply(lambda x: pad_sos_eos(x["highlights"], encoder.word_vocab["__sos"], encoder.word_vocab["__eos"]),\
axis=1)

save_as_pickle("df_encoded_CNN.pkl", df)
save_as_pickle("df_encoded.pkl", df)
encoder.save("./data/vocab.pkl")
return df
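
The dailymail branch repeats the CNN extraction verbatim before concatenating the two frames. A condensed sketch of that shared logic on in-memory strings (the sample stories are made up; the real loop iterates over the files in args.data_path1 and args.data_path2):
```python
import re
import pandas as pd

def stories_to_df(texts):
    # Same split as above: body = text before the first @highlight,
    # highlights = the rest, with later @highlight tags flattened to ". ".
    rows = []
    for text in texts:
        highlights = re.search("@highlight(.*)", text).group(1).replace("@highlight", ". ")
        body = text[:re.search("@highlight", text).span(0)[0]]
        rows.append({"body": body, "highlights": highlights})
    return pd.DataFrame(rows, columns=["body", "highlights"])

cnn_df = stories_to_df(["A CNN story. @highlight point one @highlight point two"])
dm_df = stories_to_df(["A dailymail story. @highlight another point"])
df = pd.concat([cnn_df, dm_df], ignore_index=True)   # same merge as the code above
```
Factoring the duplicated loop into a helper like this would also keep the CNN and dailymail branches from drifting apart.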

@@ -165,14 +185,14 @@ def load_dataloaders(args):
"""Load processed data if exist, else do preprocessing and loads it. Feeds preprocessed data into dataloader,
returns dataloader """
logger.info("Loading dataloaders...")
p_path = os.path.join("./data/", "df_unencoded_CNN.pkl")
train_path = os.path.join("./data/", "df_encoded_CNN.pkl")
p_path = os.path.join("./data/", "df_unencoded.pkl")
train_path = os.path.join("./data/", "df_encoded.pkl")
if (not os.path.isfile(p_path)) and (not os.path.isfile(train_path)):
df = get_CNN_data(args, load_extracted=False)
df = get_data(args, load_extracted=False)
elif os.path.isfile(p_path) and (not os.path.isfile(train_path)):
df = get_CNN_data(args, load_extracted=True)
df = get_data(args, load_extracted=True)
elif os.path.isfile(train_path):
df = load_pickle("df_encoded_CNN.pkl")
df = load_pickle("df_encoded.pkl")

trainset = text_dataset(df, args)
max_features_length = trainset.max_x_len
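
The isfile checks join "./data/" explicitly while load_pickle and save_as_pickle take bare file names, so the sketch below spells out the contract this code appears to assume; the helper bodies are guesses, not the repository's actual implementations:
```python
import os
import pickle

# Assumed behaviour: both helpers resolve bare file names against ./data/,
# matching the os.path.join("./data/", ...) checks in load_dataloaders.
def save_as_pickle(filename, obj, data_dir="./data/"):
    with open(os.path.join(data_dir, filename), "wb") as f:
        pickle.dump(obj, f)

def load_pickle(filename, data_dir="./data/"):
    with open(os.path.join(data_dir, filename), "rb") as f:
        return pickle.load(f)
```
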
2 changes: 2 additions & 0 deletions nlptoolkit/summarization/trainer.py
@@ -47,6 +47,8 @@ def train_and_fit(args):
losses_per_epoch, accuracy_per_epoch = load_results(model_no=args.model_no)

batch_update_steps = int(train_length/(args.batch_size*10))

logger.info("Number of training data points: %d" % train_length)
logger.info("Starting training process...")
optimizer.zero_grad()
for e in range(start_epoch, args.num_epochs):
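
The hunk stops at the top of the epoch loop; as a rough illustration of how the gradient_acc_steps and max_norm arguments from summarize.py are typically wired into such a loop (a generic sketch, not the repository's trainer):
```python
import torch

def run_epoch(model, criterion, optimizer, train_loader, gradient_acc_steps=4, max_norm=1.0):
    # Gradients accumulate over several mini-batches before each optimizer
    # step, so the effective batch is batch_size * gradient_acc_steps.
    optimizer.zero_grad()
    for i, (src, trg) in enumerate(train_loader):
        loss = criterion(model(src), trg) / gradient_acc_steps
        loss.backward()
        if (i + 1) % gradient_acc_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()
```
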
2 changes: 1 addition & 1 deletion nlptoolkit/summarization/utils/word_char_level_vocab.py
@@ -11,7 +11,7 @@

class tokener(object):
def __init__(self, lang="en"):
d = {"en":"en_core_web_sm", "fr":"fr_core_news_sm"}
d = {"en":"en_core_web_lg", "fr":"fr_core_news_sm"}
self.ob = spacy.load(d[lang])

def tokenize(self, sent):
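
Note that en_core_web_lg is a much larger download than en_core_web_sm and, like any spaCy model, must be installed separately (python -m spacy download en_core_web_lg) before tokener() will load. A minimal usage sketch, with the import path inferred from the file location:
```python
# Requires: python -m spacy download en_core_web_lg
from nlptoolkit.summarization.utils.word_char_level_vocab import tokener

tok = tokener(lang="en")
tokens = tok.tokenize("The summary should capture the gist of the article.")
```
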
10 changes: 6 additions & 4 deletions summarize.py
@@ -16,23 +16,25 @@

if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--data_path", type=str, default="C:/Users/WT/Desktop/Python_Projects/NLP/TextSummarisation/cnn_stories/cnn/stories/",\
parser.add_argument("--data_path1", type=str, default="./data/summarize_data/datasets/cnn_stories/cnn/stories/",\
help="Full path to CNN dataset")
parser.add_argument("--data_path2", type=str, default="./data/summarize_data/datasets/dailymail_stories/dailymail/stories/",\
help="Full path to dailymail dataset (leave as empty string if none)")
parser.add_argument("--level", type=str, default="bpe", help="Level of tokenization (word, char or bpe)")
parser.add_argument("--bpe_word_ratio", type=float, default=0.7, help="Ratio of BPE to word vocab")
parser.add_argument("--bpe_vocab_size", type=int, default=7000, help="Size of bpe vocab if bpe is used")
parser.add_argument("--max_features_length", type=int, default=200, help="Max length of features (word, char or bpe level)")
parser.add_argument("--max_features_length", type=int, default=1000, help="Max length of features (word, char or bpe level)")
parser.add_argument("--d_model", type=int, default=128, help="Transformer model dimension")
parser.add_argument("--ff_dim", type=int, default=128, help="Transformer Feed forward layer dimension")
parser.add_argument("--num", type=int, default=6, help="Transformer number of layers per block")
parser.add_argument("--n_heads", type=int, default=4, help="Transformer number of attention heads")
parser.add_argument("--LAS_embed_dim", type=int, default=128, help="LAS Embedding dimension")
parser.add_argument("--LAS_hidden_size", type=int, default=128, help="LAS listener hidden_size")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
parser.add_argument("--fp16", type=int, default=1, help="1: use mixed precision ; 0: use floating point 32")
parser.add_argument("--num_epochs", type=int, default=8000, help="No of epochs")
parser.add_argument("--lr", type=float, default=0.0003, help="learning rate")
parser.add_argument("--gradient_acc_steps", type=int, default=2, help="Number of steps of gradient accumulation")
parser.add_argument("--gradient_acc_steps", type=int, default=4, help="Number of steps of gradient accumulation")
parser.add_argument("--max_norm", type=float, default=1.0, help="Clipped gradient norm")
parser.add_argument("--T_max", type=int, default=5000, help="number of iterations before LR restart")
parser.add_argument("--model_no", type=int, default=0, help="Model ID: 0 = Transformer, 1 = LAS")
