#Given BIO-tagged texts, trains a CRF to label isnads in unseen texts using token-level features.
#
#If --crossval is passed as an argument, leave-one-out cross-validation is performed and the results for each
# fold are written to a JSON-lines file.
#Without --crossval (or --crossvalPredict), the trained model is pickled to the output location.
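#Example invocations (illustrative only; the file names are placeholders, not part of the original script):
#  python trainCRF.py --features tokens --crossval tagged_paragraphs.jsonl crossval_results.jsonl
#  python trainCRF.py --features vectors --chunkSize 200 --sampleCount 3 tagged_paragraphs.jsonl crf_model.pkl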
import sys
import json
import pickle
import argparse
import random
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import cross_validate,cross_val_predict,KFold
from sklearn.metrics import make_scorer
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
def getVocab(paragraphs):
    vocab = []
    for p in tqdm(paragraphs,desc="Creating vocab"):
        for token in p["tokens"]:
            if token not in vocab:
                vocab.append(token)
    return vocab
def readVectors(file):
    #TODO: Handle unknown tokens? Does GloVe make an UNK embedding?
    #reads in word embeddings from a text file
    # the filename should eventually be an argument to the program
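    #Assumed input format (GloVe-style text vectors; an illustrative example, not taken from the actual file):
    # each line is a token followed by its space-separated vector components, e.g.
    #   token0 0.013 -0.241 0.517 ...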
    vectors = {}
    with open(file,"r",encoding="utf8") as infile:
        for line in infile:
            #skip the last line, if it's empty
            if len(line) > 1:
                #determine the size of the vectors
                vectorList = line.split(" ")
                vectorSize = len(vectorList)-1
                term = vectorList[0]
                dims = vectorList[1:]
                vectors[term] = np.array([float(val) for val in dims])
    #print(list(vectors.keys())[:10])
    return vectors,vectorSize
#converts a long document into a series of shorter documents, repeating the split once per sampled offset
# where a sub-document does not begin or end at a complete document boundary, it is padded
# with dummy data to prevent the model from learning to start embedded genres with I tags
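#Illustrative example (not from the original code): with chunkSize=200 and a sampled offset of -50,
# a 500-token document is split at token indices 0, 150, and 350; each resulting chunk is padded with
# PAD_TOKEN/PAD_TAG on any side where its split point falls inside the document rather than at the
# document's true start or end.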
def chunkTaggedSequences(data,sampleCount,chunkSize):
    #find the offsets we will use to move the points where the documents are split
    offsets = []
    while len(offsets) < sampleCount: offsets.append(random.randint((-1*chunkSize)+1,0))
    #print(offsets)
    #print(data[0].keys())
    chunkedData = []
    documentID = 0
    #split each tag sequence up into subsequences of chunkSize beginning at [offset+chunkSize]
    for document in data:
        document["sourceDocument"] = -1
        tokens = document["tokens"]
        tags = document["tags"]
        for offset in offsets:
            currentToken = offset
            while currentToken < len(document["tokens"]):
                newDoc = {}
                newDoc["bookID"] = document["bookID"]
                newDoc["paragraphNumber"] = document["paragraphNumber"]
                newDoc["sourceDocument"] = documentID
                #actually split the tokens and tags
                startpoint = max(currentToken,0)
                endpoint = min(currentToken+chunkSize,len(tags))
                #newDoc["startIndex"] = startpoint
                #newDoc["endIndex"] = endpoint
                if startpoint > 0:
                    newDoc["tags"] = ["PAD_TAG"]*windowLen
                    newDoc["tokens"] = ["PAD_TOKEN"]*windowLen
                else:
                    newDoc["tags"] = []
                    newDoc["tokens"] = []
                newDoc["tags"] += tags[startpoint:endpoint+1]
                newDoc["tokens"] += tokens[startpoint:endpoint+1]
                if endpoint < len(tags):
                    newDoc["tags"] += ["PAD_TAG"]*windowLen
                    newDoc["tokens"] += ["PAD_TOKEN"]*windowLen
                currentToken += chunkSize
                chunkedData.append(newDoc)
        documentID += 1
    return chunkedData
def makeTrainTestSplits(numTestDocs,allDocs,trainOnChunks = False):
    splits = []
    for testIndex in range(numTestDocs):
        #get the indices of all the documents that aren't derived from the current test document and
        # aren't themselves complete (unchunked) test documents
        if trainOnChunks:
            trainIndices = [j for j in range(len(allDocs)) if allDocs[j]["sourceDocument"] not in [testIndex,-1]]
        else:
            trainIndices = [j for j in range(len(allDocs)) if j!=testIndex]
        splits.append((trainIndices,[testIndex]))
    return splits
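#Illustrative example (not from the original code): if allDocs holds two complete documents followed by
# their chunks, with sourceDocument values [-1, -1, 0, 0, 1, 1], then for testIndex=0 with
# trainOnChunks=True the training indices are [4, 5] (only chunks derived from document 1) and the
# test set is [0]; with trainOnChunks=False the training indices are every index except 0.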
def featurizeTokenCounts(windowSize=5,count=True):
    #creates the feature dictionary for the ith token of a given text
    def _featureize(text,i):
        features = {}
        #add the current token
        features["token"] = text[i]
        #add features for the counts of tokens preceding the current one
        # and those following the current one.
        # for token in vocab:
        #     features[token+"_b"] = 0
        #     features[token+"_a"] = 0
        windowBegin = max(i-windowSize,0)
        windowEnd = min(i+1+windowSize,len(text))
        for j in range(windowBegin,i):
            otherToken = text[j]
            featName = otherToken+"_b"
            if featName not in features:
                features[featName] = 0
            if count:
                features[featName] += 1
            else:
                features[featName] = 1
        for j in range(i+1,windowEnd):
            otherToken = text[j]
            featName = otherToken+"_a"
            if featName not in features:
                features[featName] = 0
            if count:
                features[featName] += 1
            else:
                features[featName] = 1
        return features
    return _featureize
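#Illustrative example (not from the original code): for text = ["a","b","c","b"], i = 2,
# windowSize = 5 and count = True, the returned features are
# {"token": "c", "a_b": 1, "b_b": 1, "b_a": 1}.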
def featurizeWordEmbeddings(vectorSize,windowSize=5):
    def _featureize(text,i):
        features = {}
        #add the current token
        #if i <=100: print("Featurizing token %s at index %d"%(text[i],i))
        if text[i] in vectors:
            tokenVector = vectors[text[i]]
        else:
            #print("missing token %s in position %d"%(text[i],i))
            tokenVector = [0.0]*vectorSize
        for j in range(len(tokenVector)):
            features["t_%d"%j] = tokenVector[j]
        windowBegin = max(i-windowSize,0)
        windowEnd = min(i+1+windowSize,len(text))
        for j in range(windowBegin,i):
            diff = i-j
            if text[j] in vectors:
                tokenVector = vectors[text[j]]
            else:
                #if j<=100: print("missing token %s in position %d"%(text[j],j))
                tokenVector = [0.0]*vectorSize
            for k in range(len(tokenVector)):
                features["t-%d_%d"%(diff,k)] = tokenVector[k]
        for j in range(i+1,windowEnd):
            diff = j-i
            if text[j] in vectors:
                tokenVector = vectors[text[j]]
            else:
                #if j<=100: print("missing token %s in position %d"%(text[j],j))
                tokenVector = [0.0]*vectorSize
            for k in range(len(tokenVector)):
                features["t+%d_%d"%(diff,k)] = tokenVector[k]
        return features
    return _featureize
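#Note on the embedding features above: the current token's vector is emitted as t_0 ... t_{vectorSize-1},
# a token d positions before it as t-d_0 ... t-d_{vectorSize-1}, and a token d positions after it as
# t+d_0 ... t+d_{vectorSize-1}; out-of-vocabulary tokens fall back to an all-zero vector. The inner
# function reads the module-level `vectors` dictionary built by readVectors.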
parser = argparse.ArgumentParser()
parser.add_argument("--chunkSize",default=200,help="Maximum token length of training chunks",type=int)
parser.add_argument("--sampleCount",default=0,help="Number of different offsets to use",type=int)
parser.add_argument("--crossval",action='store_true')
parser.add_argument("--crossvalPredict",action='store_true')
parser.add_argument("--features",choices=["vectors","tokens"],help="Defines what type of features to use when training this model, either token frequencies or word embeddings",required=True)
parser.add_argument("--windowSize",default=5,help="The size of the window (in tokens) to look both ahead of and behind the token being featurized to create features",type=int)
parser.add_argument("infile")
parser.add_argument("outfile")
args = parser.parse_args()
infile = args.infile
outfile = args.outfile
crossVal = args.crossval
crossValPredict = args.crossvalPredict
chunkSize = args.chunkSize
sampleCount = args.sampleCount
featureType = args.features
windowLen = args.windowSize
#read in the data
data = []
f = open(infile,"r",encoding="utf8")
for line in f.readlines():
    data.append(json.loads(line))
f.close()
#remove untagged data
print("Read %d paragraphs"%len(data))
unchunkedData = [d for d in data if len(d["tags"])>0]
print("Kept %d tagged sections"%len(unchunkedData))
if sampleCount > 0:
    print("Splitting texts into multiple chunks")
    trainingData = chunkTaggedSequences(unchunkedData,sampleCount,chunkSize)
else:
    trainingData = unchunkedData
# f = open(outfile,"w",encoding="utf8")
# for entry in trainingData:
#     json.dump(entry,f,ensure_ascii=False)
#     f.write("\n")
#create the feature extracting function
if featureType == "tokens":
    featurizer = featurizeTokenCounts(windowSize=windowLen,count=True)
elif featureType == "vectors":
    print("Reading vectors")
    vectors,vectorSize = readVectors("vectors_priOnly.txt")
    featurizer = featurizeWordEmbeddings(vectorSize,windowSize=windowLen)
    print("Read %d vectors of size %d"%(len(vectors),vectorSize))
else:
    print("Unknown feature type: %s"%featureType)
    sys.exit(1)
#extract features and create the X and Y datasets
if sampleCount > 0:
    numDocs = len(unchunkedData)+len(trainingData)
    numTestDocs = len(unchunkedData)
    allDocs = unchunkedData+trainingData
else:
    numDocs = len(trainingData)
    numTestDocs = len(unchunkedData)
    allDocs = trainingData
X = []
Y = []
print("Featurizing documents")
for doc in tqdm(allDocs):
    featureVec = []
    text = doc["tokens"]
    tags = doc["tags"]
    for i in range(len(text)):
        featureVec.append(featurizer(text,i))
    X.append(featureVec)
    Y.append(tags)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100
)
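#c1 and c2 above are the L1 and L2 regularization coefficients for the L-BFGS trainer in sklearn_crfsuite.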
if crossVal:
    #this is not a good idea for large datasets, as it performs LOO cross-validation
    trainTestSplits = makeTrainTestSplits(numTestDocs,allDocs,trainOnChunks = False)
    evalMetrics = {"precision":make_scorer(sklearn_crfsuite.metrics.flat_precision_score,average="macro"),
                   "recall":make_scorer(sklearn_crfsuite.metrics.flat_recall_score,average="macro"),
                   "f1":make_scorer(sklearn_crfsuite.metrics.flat_f1_score,average="macro"),
                   "sequence_acc":make_scorer(sklearn_crfsuite.metrics.sequence_accuracy_score)}
    #this needs to be reworked to properly use only the documents not derived from the test document to
    # train models while cross-validating
    results = cross_validate(crf,X,Y,scoring=evalMetrics,cv=trainTestSplits,n_jobs=8,return_estimator=True)
    print(results.keys())
    #add the true and predicted values for each held-out sequence
    # this probably doesn't use the right held-out sequence:
    # the ordering of the models may be interleaved due to parallelism
    true = [Y[i] for i in range(numTestDocs)]
    predicted = [results["estimator"][i].predict_single(X[i]) for i in range(numTestDocs)]
    for i in range(numTestDocs): assert len(true[i])==len(predicted[i])
    results["true"] = true
    results["predicted"] = predicted
    #write the results, one per fold, to a json file
    outfile = open(outfile,"w",encoding="utf8")
    for i in range(numTestDocs):
        singleResult = {}
        trueVals = results["true"][i]
        predictedVals = results["predicted"][i]
        # print(i,len(trueVals),len(predictedVals))
        # print("True: %s"%trueVals)
        # print("Pred: %s"%predictedVals)
        singleResult["precision"] = results["test_precision"][i]
        singleResult["recall"] = results["test_recall"][i]
        singleResult["f1"] = results["test_f1"][i]
        singleResult["sequence_acc"] = results["test_sequence_acc"][i]
        singleResult["true"] = trueVals
        singleResult["predicted"] = predictedVals
        singleResult["tokens"] = unchunkedData[i]["tokens"]
        # singleResult["calculated_precision"] = sklearn_crfsuite.metrics.flat_precision_score([trueVals],[predictedVals],average="macro")
        # singleResult["calculated_recall"] = sklearn_crfsuite.metrics.flat_recall_score([trueVals],[predictedVals],average="macro")
        # singleResult["calculated_f1"] = sklearn_crfsuite.metrics.flat_f1_score([trueVals],[predictedVals],average="macro")
        # singleResult["calculated_sequence_acc"] = sklearn_crfsuite.metrics.sequence_accuracy_score([trueVals],[predictedVals])
        outfile.write(json.dumps(singleResult,ensure_ascii=False)+"\n")
    outfile.close()
elif crossValPredict:
    #rather than evaluating on the folds, this will write the results from cross-validation on
    # each instance to a file using 10-fold CV
    #it does this in series, and might take ages
    #results = cross_val_predict(crf,X,Y,cv=10,verbose=0,n_jobs=-1)
    #iterate over the folds
    #write the results, one per instance, to a json file
    outfile = open(outfile,"w",encoding="utf8")
    kf = KFold(n_splits=10,shuffle=True)
    for trainIndices,testIndices in kf.split(X,Y):
        print(len(testIndices))
        #get train and test data
        Xtrain = [X[i] for i in trainIndices]
        Xtest = [X[i] for i in testIndices]
        Ytrain = [Y[i] for i in trainIndices]
        #fit the model to the training data
        crf.fit(Xtrain,Ytrain)
        #get the results for the test data
        results = crf.predict(Xtest)
        for i,testIndex in zip(range(len(testIndices)),testIndices):
            singleResult = {}
            trueVals = Y[testIndex]
            predictedVals = results[i]
            singleResult["true"] = trueVals
            singleResult["predicted"] = predictedVals
            singleResult["tokens"] = data[testIndex]["tokens"]
            singleResult["id"] = data[testIndex]["id"]
            # singleResult["calculated_precision"] = sklearn_crfsuite.metrics.flat_precision_score([trueVals],[predictedVals],average="macro")
            # singleResult["calculated_recall"] = sklearn_crfsuite.metrics.flat_recall_score([trueVals],[predictedVals],average="macro")
            # singleResult["calculated_f1"] = sklearn_crfsuite.metrics.flat_f1_score([trueVals],[predictedVals],average="macro")
            # singleResult["calculated_sequence_acc"] = sklearn_crfsuite.metrics.sequence_accuracy_score([trueVals],[predictedVals])
            outfile.write(json.dumps(singleResult,ensure_ascii=False)+"\n")
    outfile.close()
else:
    #only fit the model to the chunked data, if it exists
    if sampleCount > 0:
        print("Fitting model to %d training examples"%len(X[numTestDocs:]))
        crf.fit(X[numTestDocs:],Y[numTestDocs:])
    #otherwise, just fit to everything
    else:
        print("Fitting model to %d training examples"%len(X))
        crf.fit(X,Y)
    print("Saving model")
    f = open(outfile,"wb")
    pickle.dump(crf,f)
    f.close()
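#Example of loading the saved model for prediction elsewhere (a sketch under the assumptions above;
# the featurizer must be rebuilt with the same settings used at training time):
#   crf = pickle.load(open("crf_model.pkl","rb"))
#   feats = [featurizer(tokens,i) for i in range(len(tokens))]
#   tags = crf.predict_single(feats)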