-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtagger.py
More file actions
58 lines (47 loc) · 2.16 KB
/
tagger.py
File metadata and controls
58 lines (47 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import pickle
class UdmurtPOSTagger:
    """Part-of-speech tagger that wraps a saved Keras sequence model.

    The model and its two lookup tables (word -> index, index -> tag) are
    loaded from the ``resources`` directory located next to this file.
    """

    @staticmethod
    def _ignore_accuracy():
        """Build the ``ignore_accuracy`` metric needed to deserialize the model.

        TensorFlow is imported lazily so the metric is only constructed when
        a model is actually being loaded.

        Returns:
            A function ``ignore_accuracy(y_true, y_pred)`` that computes the
            accuracy over positions whose true class index is not 0.
        """
        import tensorflow as tf

        def ignore_accuracy(y_true, y_pred):
            y_true_class = tf.argmax(y_true, axis=-1)
            y_pred_class = tf.argmax(y_pred, axis=-1)
            # Positions whose true class is 0 are excluded from the score
            # (presumably class 0 is the padding tag -- confirm against the
            # training code).
            ignore_mask = tf.cast(tf.not_equal(y_true_class, 0), 'int32')
            matches = tf.cast(tf.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
            # Clamp the denominator to 1 so a fully masked batch yields 0
            # instead of dividing by zero.
            accuracy = tf.reduce_sum(matches) / tf.maximum(tf.reduce_sum(ignore_mask), 1)
            return accuracy

        return ignore_accuracy

    def __init__(self):
        """Load the model and the lookup dictionaries from ``resources``.

        Raises:
            OSError: If a resource file is missing or unreadable.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        resources_dir = os.path.join(current_dir, 'resources')

        # Load model. The custom metric must be supplied because the saved
        # model references it by name.
        model_path = os.path.join(resources_dir, 'udmurt_pos_tagger_model.h5')
        self.model = load_model(
            model_path,
            custom_objects={'ignore_accuracy': self._ignore_accuracy()}
        )

        # Load dictionaries.
        # NOTE(security): pickle.load can execute arbitrary code. This is
        # acceptable only because the files ship with the package; never
        # point these paths at untrusted data.
        with open(os.path.join(resources_dir, 'udmurt_pos_word2index.pkl'), 'rb') as f:
            self.word2index = pickle.load(f)
        with open(os.path.join(resources_dir, 'udmurt_pos_index2tag.pkl'), 'rb') as f:
            self.index2tag = pickle.load(f)

        # Sequence length the model was built for (second input dimension).
        # NOTE(review): this may be None for a model with variable-length
        # input; predict() handles that case.
        self.max_length = self.model.input_shape[1]

    def predict(self, tokens):
        """Return one tag per token.

        Tokens are lower-cased before lookup; words missing from
        ``word2index`` are mapped to the ``'-OOV-'`` entry.

        Inputs longer than ``max_length`` are tagged in consecutive windows
        of ``max_length`` tokens that are sent to the model as one batch, so
        the result always has exactly ``len(tokens)`` entries. Tokens in
        different windows are tagged without shared context.

        Args:
            tokens: Iterable of token strings forming one sentence.

        Returns:
            List of tags, aligned one-to-one with ``tokens``. An empty input
            yields an empty list without invoking the model.

        Raises:
            KeyError: If ``word2index`` has no ``'-OOV-'`` entry.
        """
        tokens = list(tokens)
        if not tokens:
            return []

        # Convert tokens to indices (OOV index looked up once, not per token).
        oov_index = self.word2index['-OOV-']
        sequence = [self.word2index.get(token.lower(), oov_index) for token in tokens]

        # A model without a fixed sequence length takes the sentence whole.
        window = self.max_length or len(sequence)
        chunks = [
            sequence[start:start + window]
            for start in range(0, len(sequence), window)
        ]

        # Post-pad every chunk with zeros up to the window size.
        batch = np.zeros((len(chunks), window), dtype='int32')
        for row, chunk in enumerate(chunks):
            batch[row, :len(chunk)] = chunk

        # Predict tags for all chunks in a single call.
        predictions = self.model.predict(batch)

        # Convert predictions to tags, discarding the padded positions.
        tags = []
        for chunk, scores in zip(chunks, predictions):
            tag_indices = scores[:len(chunk)].argmax(axis=-1)
            tags.extend(self.index2tag[idx] for idx in tag_indices)
        return tags