'''
Outputs the most helpful review, a summary, and an evaluation
of that summary for the dataset at the input data path.

Example usage as a script:
    python summarize.py reviews_CDs_and_Vinyl_5.json.gz

The pipeline is as follows:
- process input reviews (cleaning + sentence segmentation)
- encode all sentences
- cluster encodings to get candidate points
- decode candidates
- optimize candidates to maximize BERT score
- evaluate
'''
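# The input JSON is expected (based on load_config and summarize_dataset
# below) to map a product id to its list of review sentences, e.g. with
# hypothetical data:
#   {"B00001234": ["Arrived quickly.", "Sound is muddy at high volume.", ...]}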
import numpy as np
import pandas as pd
import gzip
import nltk
import pickle
import evaluation as ev
import torch
import torchtext
# indicoio is only needed for the extractive path; imported lazily in encode()
import utils
import sys
import json
from pathlib import Path
from extractive.helpers import find_clusters, sample
from utils.dataset import Dataset
from bert_finetune.BERTEval import BERTpredictor
from autotransformer.transformer.flow import make_model
from autotransformer.summary_ae_datahandler import make_sentence_iterator, greedy_decode
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig


def load_config(config):
    """Load models and data from the paths listed in the initial config.

    The config dict is mutated in place; nothing is returned.
    """
    config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # allow unpickling of objects that were saved under the module name 'dataset'
    sys.modules['dataset'] = utils.dataset
    with open(config['dataset_path'], 'r') as fp:
        data = json.load(fp)
    config['dataset'] = data
    if not config['extractive']:
        src_vocab = torch.load(Path(config['src_vocab_path']))
        trg_vocab = torch.load(Path(config['trg_vocab_path']))
        model = make_model(len(src_vocab), len(trg_vocab))
        model.load_state_dict(torch.load(Path(config['autoencoder_path'])))
        config['src_vocab'] = src_vocab
        config['trg_vocab'] = trg_vocab
        config['autoencoder'] = model
        bert_config = BertConfig(str(Path(config['BERT_config_path'])))
        bert = BertForSequenceClassification(bert_config, num_labels=1)
        bert.load_state_dict(torch.load(Path(config['BERT_finetune_path'])))
        config['BERT_finetune_model'] = bert


def encode(sentences, config):
    '''
    [encode sentences config] returns encodings for [sentences]

    param [sentences]: str list
    returns: 2d numpy array of encodings, shape (n_sents, dim)
    (the extractive path may return a list of lists instead)
    '''
    if config['extractive']:
        import indicoio  # optional dependency, only needed for this path
        API_KEY = "Private - contact if you need it!"
        indicoio.config.api_key = API_KEY
        encodings = indicoio.text_features(sentences, version=2)
        return encodings
    else:
        device = config['device']
        model = config['autoencoder']
        model.eval()
        model.to(device)
        sent_data = make_sentence_iterator(sentences, config['ae_batchsize'])
        sent_iter, SRC, BOS_WORD, EOS_WORD, BLANK_WORD, CLS_WORD = sent_data
        SRC.vocab = config['src_vocab']
        encodings = []
        for i, batch in enumerate(sent_iter):
            src = batch.src.transpose(0, 1).to(device)
            src_mask = (src != SRC.vocab.stoi[BLANK_WORD]).unsqueeze(-2).to(device)
            batch_encodings = model.encode(src, src_mask)
            for sent_encoding in batch_encodings:
                # keep only the encoding at the first ([CLS]-style) position
                encodings.append(sent_encoding[0, :].cpu().detach().numpy())
        return np.array(encodings)
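
# A minimal usage sketch for encode (hypothetical sentences; assumes
# load_config has already populated config):
#   sents = ["Great sound quality.", "Battery died after a week."]
#   vecs = encode(sents, config)
#   vecs.shape  # -> (2, dim)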


def cluster(encodings, sentences, config):
    '''
    Group sentence encodings and return candidates: sampled sentences in the
    extractive case, cluster mean vectors in the abstractive case.
    '''
    # encodings can be a list of lists or a 2d numpy array; this cast
    # prevents a list of numpy arrays, which breaks some indexing operations
    encodings = np.asarray(encodings)
    if config['extractive']:
        sentence_labels, num_clusters = find_clusters(encodings, config)
        candidate_sentences = sample(sentences, sentence_labels, encodings,
                                     num_clusters, config)
        return candidate_sentences
    else:
        sentence_labels, _ = find_clusters(encodings, config)
        sentence_labels = np.asarray(sentence_labels)
        means = []
        for cluster_id in set(sentence_labels):
            if cluster_id == -1:
                # -1 marks points that were not assigned to any cluster
                continue
            cluster_indices = np.where(sentence_labels == cluster_id)
            cluster_core_samples = encodings[cluster_indices]
            average = np.mean(cluster_core_samples, axis=0)
            means.append(average)
        # this returns a list of 1d numpy arrays
        return means
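
# The clustering keys in the default config (density_parameter,
# minimum_samples) suggest a DBSCAN-style algorithm inside find_clusters,
# where -1 is the conventional label for noise points; this is an assumption
# about extractive.helpers, not something this module enforces.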


def decode(candidate_points, config):
    '''Map candidate points back to text; extractive candidates already are text.'''
    if config['extractive']:
        return candidate_points
    else:
        return greedy_decode(config['autoencoder'], candidate_points, config['trg_vocab'])


def optimize(candidate_sents, config):
    '''Select and order candidate sentences into a review via the optimizer.'''
    bert = BERTpredictor(config, candidate_sents)
    best_sents = peter_optimizer(bert, candidate_sents, config)[0]
    review = ' '.join([bert.sentences[i] for i in best_sents])
    return review


def peter_optimizer(bert, candidate_sents, config):
    # NOTE: stashing runtime state inside opt_dict is a workaround; the
    # optimizer reads these keys from the config it receives
    config['opt_dict']['max_sentence_ind'] = len(candidate_sents)
    config['opt_dict']['eval_class'] = bert
    print("number of candidate sentences: ", len(candidate_sents))
    length_range = config['opt_dict']['length_range']
    len_X = config['opt_dict']['optimize_population']
    X = []
    sent_range = np.arange(0, len(candidate_sents), 1)
    for i in range(len_X):
        x_len = np.random.randint(length_range[0], length_range[1])
        # sample without replacement when possible so a candidate review
        # does not repeat sentences
        if x_len <= len(candidate_sents):
            x = np.random.choice(sent_range, size=x_len, replace=False).tolist()
        else:
            x = np.random.choice(sent_range, size=x_len, replace=True).tolist()
        X.append(x)
    return config['opt_function'].optimize(X, config)
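
# Each individual in the initial population X is a variable-length list of
# indices into the candidate sentences; the optimizer (GeneticBertOptimizer
# in the default config) evolves these index lists and scores each assembled
# review with the finetuned single-output BERT model stashed in
# opt_dict['eval_class']. For example, X might look like (hypothetical
# indices):
#   [[3, 17, 8, 0, 12], [5, 1, 19], ...]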


def evaluate(hyp_text, ref_text, hyp_enc, ref_enc):
    '''
    return evaluation of hypothesis text (our output) compared to
    reference text (gold standard):
    rouge score in the form of a tuple (f-score, precision, recall)
    cosine similarity in the form of a float in [0, 1]

    param [hyp_text]: string of our summary text
    param [ref_text]: string of gold standard summary
    param [hyp_enc]: 1d numpy array of our summary embedding
    param [ref_enc]: 1d numpy array of gold standard embedding,
        must be same dimensions as above
    '''
    eval_dict = ev.evaluate_rouge(hyp_text, ref_text)
    eval_dict['cos_sim'] = ev.evaluate_embeddings(hyp_enc, ref_enc)
    return eval_dict
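
# A usage sketch (hypothetical values; the exact ROUGE keys come from
# evaluation.evaluate_rouge):
#   scores = evaluate(our_summary, gold_summary, our_vec, gold_vec)
#   scores['cos_sim']  # -> e.g. 0.87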


def print_first_5(lst):
    '''
    print the first 5 elements of lst

    param [lst]: very long list that would take too long to print fully
    '''
    print_str = '['
    for x in range(min(5, len(lst))):
        print_str += repr(lst[x])
        print_str += '; \n'
    print_str += '... ]'
    print(print_str)


def summarize_product(sentences, config):
    '''
    param [sentences]: the list of all tokenized review sentences for one product
    '''
    encodings = encode(sentences, config)
    candidate_points = cluster(encodings, sentences, config)
    candidate_sents = decode(candidate_points, config)
    for c in candidate_sents:
        print(c)
    solution = optimize(candidate_sents, config)
    return solution


def summarize_dataset(config):
    load_config(config)
    results = {}
    for ix, sentences in config['dataset'].items():
        # drop very short sentences (40 characters or fewer)
        sents = list(filter(lambda x: len(x) > 40, sentences))
        output = summarize_product(sents, config)
        print('Final Output:', output)
        results[ix] = output
    with open(config['save_path'], 'w') as outfile:
        json.dump(results, outfile)


def summarize_datasets(config):
    for ix, dataset_path in enumerate(config['dataset_path_list']):
        config['dataset_path'] = dataset_path
        config['save_path'] = config['dataset_save_list'][ix]
        summarize_dataset(config)
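
# To batch several datasets (hypothetical paths), populate the two lists and
# call summarize_datasets instead of summarize_dataset:
#   config['dataset_path_list'] = ['data/electronics_1.json', 'data/cds_1.json']
#   config['dataset_save_list'] = ['data/electronics_1_results.json',
#                                  'data/cds_1_results.json']
#   summarize_datasets(config)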


if __name__ == "__main__":
    from optimization.finetune_bert_genetic_optimizer import GeneticBertOptimizer
    config = {
        'save_path': 'data/electronics_results.json',
        'dataset_path': 'autotransformer/data/electronics_dataset_1.pkl',
        'dataset_path_list': [],
        'dataset_save_list': [],
        'dataset': None,
        'extractive': False,
        'device': None,
        'src_vocab_path': 'models/electronics/src_vocab.pt',
        'src_vocab': None,
        'trg_vocab_path': 'models/electronics/trg_vocab.pt',
        'autoencoder_path': 'models/electronics/electronics_autoencoder_epoch7_weights.pt',
        'autoencoder': None,
        'ae_batchsize': 5000,
        'density_parameter': 2,
        'minimum_samples': 2,
        'min_clusters': 50,
        'max_acceptable_clusters': 200,
        'min_num_candidates': 250,
        'BERT_finetune_path': 'models/electronics/finetune_electronics_mae_mk31.pt',
        'BERT_config_path': 'models/electronics/finetune_electronics_mae_mk31config.json',
        'BERT_finetune_model': None,
        'BERT_batchsize': 25,
        'length_penalty_order': 1.5,
        'opt_function': GeneticBertOptimizer(),
        'opt_dict': {
            # for population-based optimization methods, the number of
            # candidate solutions maintained per iteration
            'optimize_population': 96,
            'n_elite': 5,
            'length_range': (5, 20),
            'length_penalty_range': (0.4, 1.0),
            'p_replace': .33,
            'p_remove': .33,
            'p_add': .33,
            'prevent_dupe_sents': True,
            'max_iter': 10,
            'print_iters': 2
        }
    }
    # allow overriding the dataset path from the command line, as in the
    # module docstring example
    if len(sys.argv) > 1:
        config['dataset_path'] = sys.argv[1]
    summarize_dataset(config)