-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstring2SOMETHING.py
More file actions
58 lines (48 loc) · 2.21 KB
/
string2SOMETHING.py
File metadata and controls
58 lines (48 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import gensim
from gensim.models import Word2Vec
import numpy as np
import smart_open
# Define constants
# Default word-count threshold for read_and_preprocess: a chunk is emitted
# once at least this many preprocessed words have been accumulated.
CHUNK_SIZE = 16
# Number of highest-scoring (index, similarity) pairs kept after ranking in
# find_most_relevant_chunk; only the first of them is returned to the caller.
TOP_K = 1
# Function to calculate cosine similarity for vector search
def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity between two vectors.

    Args:
        vec_a: First vector (any array-like of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b| + epsilon)``. A vector with zero norm
        yields ``0.0`` because the numerator is zero and ``epsilon`` keeps the
        denominator non-zero. Returns ``-1.0`` (the lowest possible score)
        when either vector contains NaN or an infinite value, so invalid
        vectors always rank last instead of producing a NaN score.
    """
    epsilon = 1e-10  # keeps the division defined for zero-norm vectors
    vec_a = np.asarray(vec_a, dtype=float)
    vec_b = np.asarray(vec_b, dtype=float)

    # Infinite components would turn the ratio into NaN (inf / inf), and NaN
    # scores make sorting by similarity unreliable, so reject them up front
    # together with NaN inputs.
    if not (np.all(np.isfinite(vec_a)) and np.all(np.isfinite(vec_b))):
        return -1.0

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + epsilon
    return float(np.dot(vec_a, vec_b) / denominator)
# Function to read and preprocess text into chunks
def read_and_preprocess(file_path, chunk_size=CHUNK_SIZE):
    """Yield the contents of a text file as chunks of preprocessed words.

    Each line is tokenized with ``gensim.utils.simple_preprocess`` and its
    tokens are appended to the current chunk. A chunk is emitted as soon as
    it holds at least ``chunk_size`` words, so chunks always end on a line
    boundary and may be longer than ``chunk_size``.

    Args:
        file_path: Path (or any URI ``smart_open`` understands) of a UTF-8
            text file.
        chunk_size: Minimum number of words per emitted chunk.

    Yields:
        list[str]: The words of one chunk. The final chunk may contain fewer
        than ``chunk_size`` words.
    """
    # smart_open.open is the supported replacement for the deprecated
    # smart_open.smart_open entry point.
    with smart_open.open(file_path, "r", encoding="utf-8") as handle:
        chunk = []
        for line in handle:
            chunk.extend(gensim.utils.simple_preprocess(line))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        # Emit whatever is left after the last line; without this the tail of
        # the file (or the whole of a short file) would be silently dropped.
        if chunk:
            yield chunk
# Function to train Word2Vec model on the document
def train_word2vec(corpus):
    """Train and return a Word2Vec model on ``corpus``.

    Args:
        corpus: Tokenized sentences (an iterable of lists of words) passed to
            ``Word2Vec`` as its ``sentences`` argument.

    Returns:
        The ``Word2Vec`` model constructed with the fixed hyperparameters
        listed below.
    """
    hyperparameters = {
        "vector_size": 100,
        "window": 5,
        "min_count": 1,
        "workers": 4,
    }
    return Word2Vec(sentences=corpus, **hyperparameters)
# Function to get vector representation of a sentence
def get_sentence_vector(model, sentence):
    """Embed ``sentence`` as the average of its known word vectors.

    Args:
        model: Trained model exposing word vectors via ``model.wv`` and the
            embedding width via ``model.vector_size``.
        sentence: Raw text; it is tokenized with
            ``gensim.utils.simple_preprocess`` before lookup.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv``, or an all-zero vector of length ``model.vector_size``
        when none of the tokens is in the vocabulary.
    """
    keyed_vectors = model.wv
    known_vectors = []
    for token in gensim.utils.simple_preprocess(sentence):
        # Tokens missing from the vocabulary are skipped.
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Nothing recognizable in the sentence: fall back to a zero vector.
    return np.zeros(model.vector_size)
# Function to find the most relevant chunk given an input string and file path
def find_most_relevant_chunk(input_sentence, file_path):
    """Return the chunk of a text file most similar to ``input_sentence``.

    The file is split into word chunks, a Word2Vec model is trained on those
    chunks (a new model on every call), and each chunk is scored against the
    input by cosine similarity of averaged word vectors.

    Args:
        input_sentence: Query text.
        file_path: Path of the text file to search.

    Returns:
        str: The words of the highest-scoring chunk joined by single spaces.
        When several chunks tie (for example when no query word is in the
        vocabulary and every score is equal), the earliest chunk wins because
        the ranking sort is stable.

    Raises:
        ValueError: If the file produced no chunks, so there is nothing to
            train on or to rank.
    """
    corpus = list(read_and_preprocess(file_path))
    if not corpus:
        # Fail early with a clear message instead of an opaque error from
        # model training or from indexing an empty ranking.
        raise ValueError(f"No text chunks could be read from {file_path!r}")

    model = train_word2vec(corpus)
    user_vector = get_sentence_vector(model, input_sentence)

    scored = [
        (
            index,
            cosine_similarity(
                user_vector, get_sentence_vector(model, ' '.join(chunk))
            ),
        )
        for index, chunk in enumerate(corpus)
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)[:TOP_K]

    best_index = ranked[0][0]
    return ' '.join(corpus[best_index])
# Example Usage
if __name__ == "__main__":
    # Guarded so that importing this module does not read a file or train a
    # model; the example only runs when the file is executed as a script.
    file_path = 'SOMETHING.txt'  # Replace with your file path
    input_sentence = "Your input string here"
    output_chunk = find_most_relevant_chunk(input_sentence, file_path)
    print(output_chunk)