diff --git a/q_and_a/.dockerignore b/q_and_a/.dockerignore new file mode 100644 index 0000000..01d7f95 --- /dev/null +++ b/q_and_a/.dockerignore @@ -0,0 +1,2 @@ +__pycache__ +venv \ No newline at end of file diff --git a/q_and_a/answer_selection.py b/q_and_a/answer_selection.py new file mode 100644 index 0000000..02e1793 --- /dev/null +++ b/q_and_a/answer_selection.py @@ -0,0 +1,155 @@ +#import +import torch +from torch.nn.functional import softmax +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from tqdm import tqdm, trange +import io +import json +import os +import numpy as np +from transformers import ( + BertTokenizer, + BertForQuestionAnswering, +) +import argparse +import requests + + +#argments +def passed_arguments(): + parser = argparse.ArgumentParser(description="Script to evaluate model predictions.") + + parser.add_argument("--data_path", + type=str, + required=True, + help="Path to evaluation dataset") + args = parser.parse_args() + return args + + parser.add_argument("--top_n", + type=int, + required=False, + default=1, + help="Top n results to consider correct") + args = parser.parse_args() + return args + + parser.add_argument("--propose_cnt", + type=int, + required=False, + default=10, + help="Number of contexts to test question on") + args = parser.parse_args() + return args + + +def wrap_select(a, i, n): + """ + a - list to select from + i - index of a to start from + n - number of things to select + """ + b = [] + + #true answer as first possition + b.append(a[i]) + j = (i+1)%len(a) + while len(b) < n+1 : + if a[j] not in b: + b.append(a[(j)]) + j = (j+1)%len(a) + return b + + +def process_data_mult_text(question, contexts): + """ + Adds CLS, and SEP tokens to input text + Run through tokenizer + Returns tokenized + """ + tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + + input_text = [] + input_ids = [] + token_type_ids = [] + for i in range(len(contexts)): + text = "[CLS] " + question + " [SEP] " + contexts[i] + " [SEP]" + input_text.append(text) + + encoded = tokenizer.encode(text) + input_ids.append(encoded) + + token_type_id = [0 if i <= encoded.index(102) else 1 ] + token_type_ids.append(token_type_id) + + return input_text, input_ids, token_type_ids + + +def get_3110_set(data_path, include_impossible = False): + """Returns the 3110 dataset data. + """ + cwd = os.getcwd() + + question, text, answer, labels, ids, is_impossible = [], [], [], [], [], [] + with open(data_path) as f: + data = json.load(f)["data"] + + for d in data: + if (include_impossible or d["is_impossible"]==0 ): + is_impossible.append(1 if d["is_impossible"] else 0) + question.append(d["question"]) + text.append(d["context"]) + answer.append(d["answer"]) + ids.append(d["id"]) + + return question, text, answer, is_impossible + + + +def evaluate_on_multiple_context(questions, contexts, answers, eval_on = 10, top_n = 1): + + + model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + results = np.zeros(len(questions)) + + for i,q in enumerate(questions): + + #get possible passages to test question on + context_sub = wrap_select(contexts, i, eval_on) + + #format questions and context for bert input + input_text, input_ids, token_type_ids = process_data_mult_text(q, context_sub) + scores_avg = np.zeros(eval_on) + + #pass input through bert saving avg of max value in logits + for j in range (eval_on): + start_scores, end_scores = model(torch.tensor([input_ids[j]]), token_type_ids=torch.tensor([token_type_ids[j]]), return_dict=False) + soft_start = softmax(start_scores) + soft_end = softmax(end_scores) + scores_avg[j] = (torch.max(soft_start) + torch.max(soft_end))/2 + + arr = scores_avg.argsort()[-top_n:][::-1] + results[i] = 1 if 0 in arr else 0 + + + return results + + +def main(data_path, top_n, propose_cnt): + + print('Getting data') + questions, contexts, answers, is_impossible = get_3110_set(data_path) + + print("Beginning Evaluation") + results = evaluate_on_multiple_context(questions, contexts, answers, eval_on=p_cnt, top_n=top_n) + print("Finished Evaluation") + print("Acc: ", np.sum(results)/len(results)) + + +if __name__ == '__main__': + args = passed_arguments() + data_path = args.data_path + top_n = args.top_n + p_cnt = arg.propose_cnt + + main(data_path, top_n, p_cnt) \ No newline at end of file diff --git a/q_and_a/app.py b/q_and_a/app.py new file mode 100644 index 0000000..71220c2 --- /dev/null +++ b/q_and_a/app.py @@ -0,0 +1,195 @@ +from flask import Flask, request, render_template, make_response +from haystack.document_stores import FAISSDocumentStore +from haystack.nodes import PDFToTextConverter, PreProcessor, FARMReader, DensePassageRetriever +from haystack.pipelines import ExtractiveQAPipeline +from haystack.schema import Document +from pipeline import answer +import pymysql +import json +import os + +app = Flask(__name__) + +app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://admin_mci:mycourseindex-qa@database-qa.cp4ury9dboly.us-east-1.rds.amazonaws.com/qa_docstore' +# app.config["input"] = "/usr/src/app/data/input" +app.config["input"] = "./data/input" +app.config["host"] = "0.0.0.0" + +def create_dpr(document_store): + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=True, + use_fast_tokenizers=True, + ) + # document_store.update_embeddings(retriever) + # document_store.save(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config") + # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, progress_bar=False, top_k_per_candidate=2) + + return retriever + +def joinParagraph(str): + s = str.replace('\n', ' ').replace('*', '').replace('%temp%', 'e').replace('```{code-cell} ocaml', '').replace('\ ', '').replace('`', '') + return re.sub('\\\s',' ', s) + +@app.route("/query_pipe",methods=['POST']) +def query_pipe(): + q=request.form['question'] + len_ans=int(request.form['lenans']) + len_retriever=int(request.form['lenretr']) + prediction = pipe.run(query=q, params={"Retriever": {"top_k": len_retriever}, "Reader": {"top_k": len_ans}}) + doc_ids = [prediction['answers'][i].document_id for i in range(len_ans)] + docs = document_store.get_documents_by_id(doc_ids) + docs = [d.content for d in docs] + ans = [prediction['answers'][i].answer for i in range(len_ans)] + return json.dumps({ + 'status':'success', + 'message': 'Process succesfully', + 'result': ans, + 'context': docs}) + +@app.route("/query",methods=['POST']) +def query(): + q=request.form['question'] + len_ans=int(request.form['lenans']) + len_retriever=int(request.form['lenretr']) + context = retriever.retrieve(query=q, top_k=len_retriever) + # prediction = pipe.run(query=q, params={"Retriever": {"top_k": len_retriever}, "Reader": {"top_k": len_ans}}) + ans = answer(1, context, q) + return json.dumps({'status':'success','message': 'Process succesfully', 'result': ans}) + +@app.route('/') +def home(): + """Return a friendly HTTP greeting.""" + return 'Hello QNA API is running' + +#endpoint to update embedded method +@app.route('/set_embed', methods=['POST']) +def set_embed(): + """Return a friendly HTTP greeting.""" + # document_store.write_documents() + document_store.update_embeddings(retriever, update_existing_embeddings=False) + document_store.save("haystack_test_faiss", "haystack_test_faiss_config") + return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in FAISS Document', 'result': document_store.get_embedding_count()}) + +@app.route('/get_docs') +def get_docs(): + """Return a friendly HTTP greeting.""" + # document_store.write_documents() + res=document_store.get_all_documents()[0].content + return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in FAISS Document', 'result': res}) + + +@app.route('/update_docstore_pdf', methods=['POST']) +def update_document(): + """Return a the url of the index document.""" + if request.files: + prev = document_store.get_document_count() + # uploaded document for target source + doc = request.files["doc"] + file_path = os.path.join(app.config["input"], doc.filename) + # saving the file to the input directory + doc.save(file_path) + # convert the pdf files into dictionary and update to Document + converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) + # doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/3110.pdf"), meta=None)[0] + doc_pdf = converter.convert(file_path=filepath, meta=None)[0] + doc_pdf.content = join_Paragraph(doc_pdf.content) + + preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, + ) + docs_default = preprocessor.process(doc_pdf) + + document_store.write_documents(docs_default) + now = document_store.get_document_count() + # os.remove(file_path) + + return json.dumps( + {'status':'Susccess', 'result': {'file': doc.filename, 'prevcount': prev, "currcount": now}}) + else: + return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []}) + +@app.route('/update_docstore_json', methods=['POST']) +def update_json(): + # doc.seek(0) + # contents = doc.read() + prev = document_store.get_document_count() + filename = request.form['filename'] + file_path = os.path.join(app.config["input"], filename) + with open(str(file_path), 'r') as j: + contents = json.load(j) + + contentlists=[] + for x in contents: + # print(x.keys()) + ans = [i['content'] for i in x['_source']['answers']] + # x.clear() + # x['content'] = x['_source']['content'] + '.'.join(ans) + contentlists.append({'content': x['_source']['content'] + '.'.join(ans)}) + + field_map = {"content": "content"} + contentlists = [Document.from_dict(d) for d in contentlists] + + document_store.write_documents(contentlists) + os.remove(file_path) + now = document_store.get_document_count() + + return json.dumps( + {'status':'Susccess', 'result': {'file': filename, 'prevcount': prev, "currcount": now}}) + # else: + # return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []}) + + +@app.route('/upload_document', methods=['POST']) +def upload_document(): + """Return a the url of the index document.""" + if request.files: + # uploaded document for target source + prev = document_store.get_document_count() + doc = request.files["doc"] + file_path = os.path.join(app.config["input"], doc.filename) + # saving the file to the input directory + doc.save(file_path) + # with open(str(file_path), 'r') as j: + # doc.seek(0) + # contents = doc.read() + # for x in contents: + # ans = [i['content'] for i in x['_source']['answers']] + # x['content'] = x['_source']['content'] + '.'.join(ans) + # field_map = {"content": "content"} + # contentlists = [Document.from_dict(d) for d in content] + + # document_store.write_documents(contentlists) + # # os.remove(file_path) + now = document_store.get_document_count() + + return json.dumps( + {'status':'Susccess', 'result': {'file': doc.filename, 'prevcount': prev, "currcount": now}}) + else: + return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []}) + +if __name__ == '__main__': + port = int(os.environ.get("PORT", 5000)) +# document_store = ElasticsearchDocumentStore( +# host = cluster_ip, +# scheme="https", +# index='cs_4780_sp2021', +# username="mciesaccess", +# password="mcioscar" +# ) + document_store = FAISSDocumentStore.load(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config") + retriever = create_dpr(document_store) + reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, progress_bar=True, return_no_answer=True) + pipe = ExtractiveQAPipeline(reader, retriever) + app.run(host=app.config["host"], port=port, debug=True) \ No newline at end of file diff --git a/q_and_a/baseline_eval.py b/q_and_a/baseline_eval.py index f7273f0..641357f 100644 --- a/q_and_a/baseline_eval.py +++ b/q_and_a/baseline_eval.py @@ -39,7 +39,9 @@ def passed_arguments(): return args -# imp_toggle = true if want to include impossible +''' +Function to process json file into question, context, answer, impossible toggle, and the id +''' def process_json(data_path, imp_toggle): question = [] text = [] @@ -67,22 +69,25 @@ def process_data(question, text): return input_text -#returns the tokenizer and model associated with the model id -# 0 = bert -# 1 = distilbert +''' +Function that returns the tokenizer and model associated with the model id + 0 = bert + 1 = distilbert +''' def model_pick(id): if (id == 0): tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') - if (id == 1): + elif (id == 1): tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad") model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad") return tokenizer, model -#pre-trained bert runs on evaluation sets -#return list of tokens for each question id +''' +Function returns predictions of given model_id +''' def predictions(model_id, input_text, print_some_outputs = True): tokenizer, model = model_pick(model_id) @@ -114,8 +119,11 @@ def predictions(model_id, input_text, print_some_outputs = True): return preds -#returns the precision, recall, and f1 score -#preds and labels should be tokenized +''' +Function that returns metrics on the predicted values +Returns the precision, recall, and f1 score +Note: preds and labels should be tokenized +''' def evaluate(preds, labels, questions, ids, model_id): tokenizer, _ = model_pick(model_id) diff --git a/q_and_a/data/input/3110.pdf b/q_and_a/data/input/3110.pdf new file mode 100644 index 0000000..b4f9891 Binary files /dev/null and b/q_and_a/data/input/3110.pdf differ diff --git a/q_and_a/data/input/demo.pdf b/q_and_a/data/input/demo.pdf new file mode 100644 index 0000000..053c33a Binary files /dev/null and b/q_and_a/data/input/demo.pdf differ diff --git a/q_and_a/docker-compose.yml b/q_and_a/docker-compose.yml new file mode 100644 index 0000000..39ff977 --- /dev/null +++ b/q_and_a/docker-compose.yml @@ -0,0 +1,8 @@ +version: "3" +services: + qaapp: + image: qaapp + ports: + - "5000:5000" + restart: on-failure + diff --git a/q_and_a/dockerfile b/q_and_a/dockerfile new file mode 100644 index 0000000..7579c58 --- /dev/null +++ b/q_and_a/dockerfile @@ -0,0 +1,58 @@ +FROM python:3.7 + + +RUN apt-get update \ + && apt-get install -y locales \ + && apt-get update \ + && dpkg-reconfigure -f noninteractive locales \ + && locale-gen C.UTF-8 \ + && /usr/sbin/update-locale LANG=C.UTF-8 \ + && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \ + && locale-gen \ + && apt-get install -y curl unzip \ + && apt-get clean \ + && apt-get autoremove + + +# Creating Application Source Code Directory +RUN mkdir -p /usr/src/app + +# Setting Home Directory for containers +WORKDIR /usr/src/app + +# Installing python dependencies +COPY requirements.txt /usr/src/app/ + +RUN apt-get install -y poppler-utils + +RUN pip install --no-cache-dir -r requirements.txt + +#get pdf converter +RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \ + tar -xvf xpdf-tools-linux-4.04.tar.gz && cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + +# Install package +RUN pip install --upgrade pip +RUN pip install --no-cache-dir .[docstores,preprocessing,ocr] + +# Copying src code to Container +COPY . /usr/src/app + +RUN chmod -R 777 /usr/src/app/data/input +RUN chmod -R 777 /usr/src/app/data +RUN chmod -R 777 /usr/src/app +RUN chmod -R 777 /usr/src + +# Application Environment variables +#ENV APP_ENV development +ENV PORT 5000 + +# Exposing Ports +EXPOSE $PORT + +# Setting Persistent data +VOLUME ["/app-data"] + +# Running Python Application +# CMD gunicorn -b :$PORT -c gunicorn.conf.py main:app +CMD python app.py diff --git a/q_and_a/faiss_document_store.db b/q_and_a/faiss_document_store.db new file mode 100644 index 0000000..1567749 Binary files /dev/null and b/q_and_a/faiss_document_store.db differ diff --git a/q_and_a/haystack_test_faiss b/q_and_a/haystack_test_faiss new file mode 100644 index 0000000..5644635 Binary files /dev/null and b/q_and_a/haystack_test_faiss differ diff --git a/q_and_a/haystack_test_faiss_config b/q_and_a/haystack_test_faiss_config new file mode 100644 index 0000000..215420a --- /dev/null +++ b/q_and_a/haystack_test_faiss_config @@ -0,0 +1 @@ +{"sql_url": "mysql+pymysql://admin_mci:mycourseindex-qa@database-qa.cp4ury9dboly.us-east-1.rds.amazonaws.com/qa", "faiss_index_factory_str": "Flat", "return_embedding": true, "embedding_dim": 768, "faiss_config_path": "haystack_test_faiss_config"} \ No newline at end of file diff --git a/q_and_a/local_index.faiss b/q_and_a/local_index.faiss new file mode 100644 index 0000000..63d8211 Binary files /dev/null and b/q_and_a/local_index.faiss differ diff --git a/q_and_a/local_index.json b/q_and_a/local_index.json new file mode 100644 index 0000000..90612dc --- /dev/null +++ b/q_and_a/local_index.json @@ -0,0 +1,5 @@ +{ + "faiss_index_factory_str": "Flat", + "embedding_dim": 768, + "faiss_config_path": "my_faiss_index.json" +} \ No newline at end of file diff --git a/q_and_a/my_faiss_index.faiss b/q_and_a/my_faiss_index.faiss new file mode 100644 index 0000000..1c1b460 Binary files /dev/null and b/q_and_a/my_faiss_index.faiss differ diff --git a/q_and_a/my_faiss_index.json b/q_and_a/my_faiss_index.json new file mode 100644 index 0000000..da64a57 --- /dev/null +++ b/q_and_a/my_faiss_index.json @@ -0,0 +1 @@ +{"faiss_index_factory_str": "Flat"} \ No newline at end of file diff --git a/q_and_a/pa_r_cossim.py b/q_and_a/pa_r_cossim.py new file mode 100644 index 0000000..58fcc7d --- /dev/null +++ b/q_and_a/pa_r_cossim.py @@ -0,0 +1,99 @@ + + +from datasets import load_dataset +import nltk +nltk.download('punkt') +import numpy as np +import random +import string +from models import InferSent +import torch + +''' +Sentences is the list of answers +Question is the list of questions +''' +def inferSent(sentences, question): + + V = 2 + MODEL_PATH = 'encoder/infersent%s.pkl' % V + params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, + 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} + + #begin model training + infersent = InferSent(params_model) + infersent.load_state_dict(torch.load(MODEL_PATH)) + W2V_PATH = 'fastText/crawl-300d-2M.vec' + infersent.set_w2v_path(W2V_PATH) + infersent.build_vocab(sentences, tokenize=True) + embeddings = infersent.encode(sentences, tokenize=True) + q_emb = infersent.encode(question, tokenize = True) + + return embeddings, q_emb + +''' +Function to generate cossin similarity score matrix +''' +def cossin_sim_scores(sentences, question): + sent_norm = sentences / sentences.sum(axis=1)[:,np.newaxis] + ques_norm = question / question.sum(axis=1)[:,np.newaxis] + dot = np.dot(ques_norm, sent_norm.T) + print(dot.shape) + return dot + + +''' +Returns the accuracy of the model, each prediction is assigned 1 if the correct answer lies in the top n +answers and 0 if not +''' +def get_top_n(dotprods, n=5): + #sort the + print('Checking top ', n, ' articles') + hit = 0 + index = 0 + top_n = dotprods.argsort(axis=1)[:,-n:] + sort = dotprods.argsort(axis=1) + for i in range (len(dotprods)): + if i in top_n[i]: + hit +=1 + index += np.where(sort[i] == i)[0] + acc, avg = hit/len(dotprods), index/len(dotprods) + print(acc, avg) + return acc, avg + +def first(example): + e = '' + e = str(example['text'][0]) + return e + + +''' +Function that gets the list of question and answers from dataset +''' +def dataset(): + ds = load_dataset("eli5", split='train_eli5') + questions = ds['title'] + a = ds['answers'] + answers = (list(map(first, a))) + print('number of samples: ', len(answers)) + return questions, answers + + +def evaluate(): + #get the question and answers from the eli5 training set + questions, answers = dataset() + #get the inferSent embeddings of the questions and answers + embs, q_embs = inferSent(answers[:2000], questions[:2000]) + #compute the cossin similarity socres + scores = cossin_sim_scores(embs, q_embs) + #get top n accuracy + acc1 , avg1 = get_top_n(scores, n=10) + acc2 , avg2 = get_top_n(scores, n=100) + acc3 , avg3 = get_top_n(scores, n=1000) + + + + +if __name__ == '__main__': + evaluate() + diff --git a/q_and_a/passage_retrieval.py b/q_and_a/passage_retrieval.py new file mode 100644 index 0000000..7d534aa --- /dev/null +++ b/q_and_a/passage_retrieval.py @@ -0,0 +1,212 @@ +from haystack.nodes import TextConverter, PDFToTextConverter, PreProcessor, FARMReader, DensePassageRetriever +from haystack.document_stores import FAISSDocumentStore +from haystack.pipelines import ExtractiveQAPipeline, JoinDocuments +from haystack import Pipeline +from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever +from haystack.utils import launch_es +from haystack.document_stores import ElasticsearchDocumentStore +from pathlib import Path +import json +import re +from pipeline import convert_pdf_to_string, answer +import heapq + + +''' +Utilizes QA model of Haystack +retriever: retrieve top contexts, uses dense passage retrieval method +reader: does QA, uses roberta +returns the retriever and the whole pipeline +''' +def create_dpr(document_store): + retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=True, + use_fast_tokenizers=True, + ) + # document_store.update_embeddings(retriever) + # document_store.save(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config") + # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, progress_bar=False, top_k_per_candidate=2) + + return retriever + +''' +ensemble retriever +1 sparse retriever using elastic search +1 dense retriever using embedding retriever +''' +def ensemble_retriever(doc_pdf): +# Initialize DocumentStore and index documents + launch_es() + document_store2 = ElasticsearchDocumentStore() + document_store2.write_documents(doc_pdf) + es_retriever = ElasticsearchRetriever(document_store=document_store2) + + embedding_retriever = EmbeddingRetriever( + document_store2, + model_format="sentence_transformers", + embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", + ) + document_store2.update_embeddings(embedding_retriever, update_existing_embeddings=False) + + p_ensemble = Pipeline() + p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) + p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]) + p_ensemble.add_node( + component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"] + ) + p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) + + return p_ensemble + +''' +retrieve top len_retr passages, produce len_ans number of answers from each passage +return a list containing len_out of (answer, score) +''' +def retriever_reader_pipe(retriever, reader_id, query, len_retr, len_ans, len_out): + passages = retriever.retrieve(query=query, top_k=len_retr) + max_score = 0 + pred = [] + for passage in passages: + for i in range(len_ans): + ans, score = answer(reader_id, passage, query) + if len(pred) < len_out: + heapq.heappush(pred, (ans, score)) + elif score > pred[0][1]: + heapq.heappushpop(pred, (ans, score)) + return pred + + +''' +Function that counts number of common words in s0 and s1, +number of words in s0, and number of words in s1 +''' +def common_words(s0, s1): + s0 = s0.lower() + s1 = s1.lower() + s0List = s0.split(" ") + s1List = s1.split(" ") + return len(list(set(s0List)&set(s1List))), len(s0List), len(s1List) + +''' +Function that returns metrics on the predicted values +Returns the precision, recall, and f1 score +''' +def evaluate(preds, labels, questions, ids, len_ans): + n = len(labels) + precision = 0 + recall = 0 + f1 = 0 + + for i in range(n): + l = labels[i] + pr, re, f1_c = 0, 0, 0 + for j in range(len_ans): + p = preds[i][j] + if len(p) == 0 or len(l) == 0: + agree = 1 if len(p) == len(l) else 0 + pr = 1 + re = 1 + f1_c = 1 + else: + intersection, pl, ll = common_words(p,l) + pr = max((1.0*intersection)/pl, pr) + re = max((1.0*intersection)/ll, re) + + if pr <= 0.5: + print("question", questions[i]) + print("label", l) + print(preds[i]) + precision += pr + #calculate recall + recall += re + #calculate f1 + f1 += 0 if (pr == 0 or re == 0) else (2*pr*re)/(pr+re) + + if (pr == 0 or re == 0): + print("\nBad answer example ",ids[i], ': ', questions[i]) + print("Prediction: ", p) + print("Answer: ", l) + print() + + #average over all samples + precision = precision/n + recall = recall/n + f1 = f1/n + + return precision, recall, f1 + +def process_json(data_path, imp_toggle): + question = [] + text = [] + answer = [] + is_impossible = [] + ids = [] + with open(data_path) as f: + data = json.load(f)["data"] + + for d in data: + if (not d["is_impossible"] or imp_toggle): + question.append(d["question"]) + text.append(d["context"]) + answer.append(d["answer"]) + is_impossible.append(d["is_impossible"]) + ids.append(d["id"]) + return question, text, answer, is_impossible, ids + + +def joinParagraph(str): + s = str.replace('\n', ' ').replace('*', '').replace('%temp%', 'e').replace('```{code-cell} ocaml', '').replace('\ ', '').replace('`', '') + return re.sub('\\\s',' ', s) + +''' +turn files into the documents that can be processed by the Haystack pipeline +store files in document_store +''' +def create_passages(doc_dir, document_store): + converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) + doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/3110.pdf"), meta=None)[0] + doc_pdf.content = join_Paragraph(doc_pdf.content) + # doc_pdf = convert_pdf_to_string(file_path) + preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, + ) + docs_default = preprocessor.process(doc_pdf) + document_store.write_documents(docs_default) +''' +return evaluation of pipe on a data set +''' +def get_evaluation(data_path, len_ans, len_retr, pipe): + questions, text, labels, _, ids = process_json(data_path, False) + preds = [] + for q in questions: + prediction = pipe.run( + query=q, params={"Retriever": {"top_k": len_retr}, "Reader": {"top_k": len_ans}}) + ans = [prediction['answers'][i].answer for i in range(len_ans)] + preds.append(ans) + + p, r, f1 = evaluate(preds, labels, questions, ids, len_ans) + return p, r, f1 + +if __name__ == '__main__': + document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", sql_url= "sqlite:///haystack_test_faiss.db") + document_store = document_store.load(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config") + create_passages(doc_dir, document_store) + retriever, pipe = create_dpr(document_store) + + len_ans = 5 + len_retr = 10 + + p, r, f1 = get_evaluation(data_path, len_ans, len_retr, pipe) + print(p, r, f1) diff --git a/q_and_a/pipeline.py b/q_and_a/pipeline.py index 2c9800d..a9927b9 100644 --- a/q_and_a/pipeline.py +++ b/q_and_a/pipeline.py @@ -57,8 +57,8 @@ def model_pick(id): tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') if (id == 1): - tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad") - model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad") + tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") + model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2") return tokenizer, model diff --git a/q_and_a/preprocess_data.py b/q_and_a/preprocess_data.py new file mode 100644 index 0000000..ce00010 --- /dev/null +++ b/q_and_a/preprocess_data.py @@ -0,0 +1,133 @@ +import torch +import pandas as pd +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from tqdm import tqdm, trange +import pandas as pd +import io +import json +import os +import numpy as np +import random +from transformers import ( + BertTokenizer, + BertForQuestionAnswering, + BertModel +) + +#helper returning the most frequent of a an array: +def most_frequent(List): + counter = 0 + num = List[0] + for i in List: + curr_frequency = List.count(i) + if(curr_frequency> counter): + counter = curr_frequency + num = i + return num + +# 1 when is_impossible, else 0 +def process_json(data_path): + print("starting yay") + question = [] + text = [] + answer = [] + is_impossible = [] + ids = [] + num_questions = [] # number of questions asked on the same context + + r = requests.get(url = data_path) + data = r.json()["data"] + data_len = len(data) + for d in data: + for p in d["paragraphs"]: + context = p["context"] + context_len = len(context) + num = 0 + for qa in p["qas"]: + quest = qa["question"] + quest_len = len(quest) + if (context_len+quest_len < 509): + question.append(quest) + if (len(qa["answers"])) != 0: + an = [] + for i in qa["answers"]: + an.append(i["text"]) + answer.append(most_frequent(an)) + + else: + answer.append("") + if qa["is_impossible"]: + is_impossible.append(1) + else: + is_impossible.append(0) + ids.append(qa["id"]) + num += 1 + if (num != 0): + text.append(context) + num_questions.append(num) + print("done") + + return question, text, answer, is_impossible, ids, num_questions + + +def process_data(question, text, num_questions): + print("oho") + input_text = [] + for i in range(len(num_questions)): + for j in range(num_questions[i]): + input_text.append( "[CLS] " + question[j] + " [SEP] " + text[i] + " [SEP]") + return input_text + + +def get_CLS(percentage): + question, text, answer, is_impossible, ids, num_questions = process_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json") + input_text = process_data(question, text, num_questions) + print("done") + input_text = input_text[int(percentage*len(input_text)):] + is_impossible = is_impossible[int(percentage*len(is_impossible)):] + + print(len(input_text)) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + print("working") + input_ids = [] + max_len = 0 + for text in input_text: + input_id = tokenizer.encode(text) + input_ids.append(input_id) + if len(input_id) > max_len: + max_len = len(input_id) + + #append the maximum length of the sentence to the list of is_impossible + is_impossible.append(max_len) + is_impossible.to_csv("is_impossible.csv", encoding='utf-8', index=False) + + padded = np.array([i + [0]*(max_len - len(i)) for i in input_ids]) + attention_mask = np.where(padded != 0, 1, 0) + input_ids = torch.tensor(padded) + print(input_ids.size) + attention_mask = torch.tensor(attention_mask) + print(attention_mask.size) + with torch.no_grad(): + last_hidden_states = model(input_ids, attention_mask = attention_mask) + print(last_hidden_states) + features = pd.DataFrame(last_hidden_states[0][:,0,:].numpy()) + print(features) + features.to_csv("features.csv", encoding='utf-8', index=False) + + return features, is_impossible, max_len + +def logisticReg(): + #features,is_impossible, max_len = get_CLS() + is_impossible = pd.read_csv('is_impossible.csv') + max_len = is_impossible[-1] + is_impossible = is_impossible[:-1] + features = pd.read_csv('features.csv') + + log_model = LogisticRegression() + log_model.fit(features,is_impossible) + print("finished") + return log_model, max_len + +if __name__ == '__main__': + get_CLS() \ No newline at end of file diff --git a/q_and_a/requirements.txt b/q_and_a/requirements.txt new file mode 100644 index 0000000..4613dfe --- /dev/null +++ b/q_and_a/requirements.txt @@ -0,0 +1,5 @@ +Flask +farm-haystack +PyMySQL +pdfminer +transformers \ No newline at end of file diff --git a/q_and_a/sample.py b/q_and_a/sample.py index dfa0241..7c05318 100644 --- a/q_and_a/sample.py +++ b/q_and_a/sample.py @@ -17,7 +17,7 @@ for i in range(len(input_ids))] #4 start_scores, end_scores = model(torch.tensor([input_ids]), \ - token_type_ids=torch.tensor([token_type_ids])) + token_type_ids=torch.tensor([token_type_ids]), return_dict=False) #5 all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) \ No newline at end of file diff --git a/q_and_a/unans_with_bert.py b/q_and_a/unans_with_bert.py new file mode 100644 index 0000000..c0ce918 --- /dev/null +++ b/q_and_a/unans_with_bert.py @@ -0,0 +1,398 @@ +import torch +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from tqdm import tqdm, trange +import pandas as pd +import io +import json +import os +import numpy as np +import random +from transformers import ( + BertTokenizer, + BertForQuestionAnswering, + BertModel, + BertConfig, + AutoTokenizer, + AutoModelForQuestionAnswering +) +import argparse +from collections import Counter +from sklearn.linear_model import LogisticRegression +from sklearn.feature_selection import RFE +from sklearn import metrics +import requests + + +###GLOBAL VARIABLES +SQUAD2_TRAIN_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json" +CLS_EMBS_DATASET_INPUT = "cls_emb_input.npy" +EMBS_DATASET_LABELS = "squad_emb_labels.npy" +AVG_EMBS_DATASET_INPUT = "avg_emb_input.npy" +TEST_EMB_DATASET_INPUT = "3110_emb_input.npy" +TEST_EMB_DATASET_LABELS = "3110_emb_labels.npy" + +def passed_arguments(): + parser = argparse.ArgumentParser(description="Script to evaluate model predictions.") + parser.add_argument("--data_path", + type=str, + required=True, + help="Path to evaluation dataset") + parser.add_argument("--emb_type", + type=int, + required=False, + default=0, + help="0 if using cls embedding \n1 if using avg of token embeddings") + + args = parser.parse_args() + return args + +#helper returning the most frequent of a an array: +def most_frequent(List): + counter = 0 + num = List[0] + for i in List: + curr_frequency = List.count(i) + if(curr_frequency> counter): + counter = curr_frequency + num = i + return num + + +#################################################################################################### +################### Functions For Processing Data and Generating Datasets ################## + +def process_json(data_path): + """ + Function that processes squad data + Arguments: + data_path: the url of the data + Returns: + question: list of questions + text: list of contexts + answer: list of answers + is_impossible: if the question is impossible to answer + ids: the id of each entry + num_questions: how many questions each context has + """ + question, text, answer, is_impossible, ids, num_questions = [], [], [], [], [], [] + r = requests.get(url = data_path) + data = r.json()["data"] + data_len = len(data) + for d in data: + for p in d["paragraphs"]: + context = p["context"] + context_len = len(context) + num = 0 + for qa in p["qas"]: + quest = qa["question"] + quest_len = len(quest) + if (context_len+quest_len < 509): + question.append(quest) + if (len(qa["answers"])) != 0: + an = [] + for i in qa["answers"]: + an.append(i["text"]) + answer.append(most_frequent(an)) + + else: + answer.append("") + if qa["is_impossible"]: + is_impossible.append(1) + else: + is_impossible.append(0) + ids.append(qa["id"]) + num += 1 + if (num != 0): + text.append(context) + num_questions.append(num) + return question, text, answer, is_impossible, ids, num_questions + + +def process_data(question, text, num_questions): + """ + Adds CLS, and SEP tokens to input text + """ + input_text = [] + for i in range(len(num_questions)): + for j in range(num_questions[i]): + input_text.append( "[CLS] " + question[j] + " [SEP] " + text[i] + " [SEP]") + return input_text + + +def get_cls_from_input(input_text, is_impossible, emb_type = 0): + """ + Function that returns CLS embeddings for each entry in input_text + """ + #BERT model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = [] #holds all the token id encodings for each input + features = np.zeros((len(input_text), 768)) + max_len = 0 + for text in input_text: + input_id = tokenizer.encode(text) + input_ids.append(input_id) + if len(input_id) > max_len: + max_len = len(input_id) + + is_impossible.append(max_len) + is_impossible = np.array(is_impossible) + + with torch.no_grad(): + for i in range(len(input_ids)): + input = input_ids[i] + padded = np.array(input + [0]*(max_len - len(input))) + attention_mask = np.where(padded != 0, 1, 0) + input = torch.tensor([padded]).type(torch.LongTensor) + attention_mask = torch.tensor([attention_mask]) + last_hidden_states = model(input, attention_mask = attention_mask) + + if (emb_type == 0): #cls token + features[i] = (last_hidden_states[0][:,0,:].numpy()) + + elif(emb_type == 1): #avg of work tokens + token_vecs = last_hidden_states[0][0] + features[i] = torch.mean(token_vecs, dim=0) + + + return features, is_impossible + + +def get_training_set(data_url = SQUAD2_TRAIN_URL, emb_type = 0): + """ + Returns the CLS tokens from passing the SQUAD 2.0 training set into BERT and labels + emb_type: 0 if using cls token emb + 1 if using avg token emb + """ + cwd = os.getcwd() + csv_name_input = os.path.join(cwd, CLS_EMBS_DATASET_INPUT ) if emb_type == 0 else os.path.join(cwd, AVG_EMBS_DATASET_INPUT) + csv_name_labels = os.path.join(cwd, EMBS_DATASET_LABELS ) + + #processing squad 2.0 data + question, text, answer, is_impossible, ids, num_questions = process_json(data_url) + #format the test to be input into BERT + input_text = process_data(question, text, num_questions) + + #check if the files exist locally + if os.path.exists(csv_name_input) and os.path.exists(csv_name_labels): + print("Found Training Set Locally") + labels = np.load(csv_name_labels) + inputs = np.load(csv_name_input) + max_len = labels[-1] + labels = labels[:-1] + return inputs, labels, input_text, max_len + + else: + features, is_impossible = get_cls_from_input(input_text, is_impossible, emb_type) + + np.save(csv_name_input, features) + np.save(csv_name_labels, is_impossible) + print(features.shape, is_impossible.shape) + max_len = is_impossible[-1] + is_impossible = is_impossible[:-1] + + return features, is_impossible, input_text, max_len + + +def get_testing_set(data_path, emb_type=0): + """Returns the CLS tokens embeddings from passing the CS3110 data set into BERT and labels. + """ + cwd = os.getcwd() + csv_name_input = os.path.join(cwd,TEST_EMB_DATASET_INPUT ) + csv_name_labels = os.path.join(cwd,TEST_EMB_DATASET_LABELS ) + + #process text + question, text, answer, test_labels, ids, is_impossible, num_questions = [], [], [], [], [], [], [] + with open(data_path) as f: + data = json.load(f)["data"] + + for d in data: + question.append(d["question"]) + text.append(d["context"]) + answer.append(d["answer"]) + #is_impossible label is 1 if it's not answerable and 0 if it is answerable + is_impossible.append(1 if d["is_impossible"] else 0) + ids.append(d["id"]) + num_questions.append(1) + + #format the text to be passed into BERT + input_text = process_data(question, text, num_questions) + + if os.path.exists(csv_name_input) and os.path.exists(csv_name_labels): + print("Found Testing Set Locally") + labels = np.load(csv_name_labels) + inputs = np.load(csv_name_input) + max_len = labels[-1] + labels = labels[:-1] + return inputs, labels, input_text, max_len + + else: + inputs, labels = get_cls_from_input(input_text, is_impossible, emb_type) + np.save(csv_name_input, inputs) + np.save(csv_name_labels, labels) + max_len = labels[-1] + labels = labels[:-1] + return inputs, labels, input_text, max_len + +#################################################################################################### +################### Functions For Answerability Classification ################## + +def logisticReg(inputs, labels, test_input, test_labels): + """Returns the best performing logistic regression model. + Function that trains a logistic regression classification model on given training data + and then evaluated it on given test data. + """ + #establishing ranges for parameters + perf = 0 + best_c = .001 + best_log = LogisticRegression(C=1) + c_param_range = [0.001,0.01,0.1,1,10,100] + for i in c_param_range: + log_model = LogisticRegression(C=i) + log_model.fit(inputs,labels) + #evaluate + y = log_model.predict(inputs) + #implement confusion matrix + other metrics + score = log_model.score(inputs, labels) + if (perf < score): + best_c = i + perf = score + best_log = log_model + print("Best C: ", best_c, " Score: ", perf) + train_preds = best_log.predict(inputs) + cm_train = metrics.confusion_matrix(labels, train_preds) + print("Confusion matrix training \n", cm_train) + + + #test on 3110 data + print("Testing on 3110 Dataset") + test_preds = best_log.predict(test_input) + cm = metrics.confusion_matrix(test_labels, test_preds) + print("Confusion Matrix Testing\n", cm) + print("Accuracy", best_log.score(test_input, test_labels)) + return best_log + + +def predictions(input_text, print_some_outputs = True): + tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + input_ids = [] + token_type_ids = [] + preds = [] + print("training") + log_model, max_len= logisticReg() + print("lets train") + for text in input_text: + input_log = tokenizer_bert.encode(text) + input_log = input_log + [0] * (max_len - len(input_log)) + attention_mask = [0 if i == 0 else 1 + for i in input_log] + input_logs = torch.tensor([input_log]) + print(input_logs.size) + attention_mask = torch.tensor([attention_mask]) + print(attention_mask.size) + with torch.no_grad(): + last_hidden_states = model_bert(input_logs,attention_mask = attention_mask) + print(last_hidden_states) + features = last_hidden_states[0][:,0,:].numpy() + print(features) + pred = log_model.predict(features) + if pred > 0: + preds.append("") + else: + input_ids = tokenizer.encode(text) + token_type_ids = [0 if i <= input_ids.index(102) else 1 + for i in range(len(input_ids))] + start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + preds.append((all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) + return preds + + +#returns the precision, recall, and f1 score +#preds and labels should be tokenized +def evaluate(preds, labels, questions, ids): + print(len(preds)) + print(len(labels)) + tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + + labels_tok = [] + for l in labels: + encoded = tokenizer.encode(l) + to_tok = tokenizer.convert_ids_to_tokens(encoded) + labels_tok.append(to_tok[1:-1]) + + n = len(labels) + precision = 0 + recall = 0 + f1 = 0 + + for i in range(len(labels_tok)): + + p = preds[i] + l = labels_tok[i] + + if len(p) == 0 or len(l) == 0: + agree = 1 if len(p) == len(l) else 0 + precision += 1 + recall += 1 + f1 += 1 + + else: + common_toks = Counter(p) & Counter(l) + intersection = 1.0 * sum(common_toks.values()) + #calculate precision + pr = intersection/ len(p) + re = intersection / len(l) + precision += pr + #calculate recall + recall += re + #calculate f1 + f1 += 0 if (pr == 0 or re == 0) else (2*pr*re)/(pr+re) + if (pr == 0 or re == 0): + + print("\nBad answer example ",ids[i], ': ', questions[i]) + print("Prediction: ", ' '.join(p)) + print("Answer: ", ' '.join(l)) + + #average over all samples + precision = precision/n + recall = recall/n + f1 = f1/n + + return precision, recall, f1 + + +def main(data_path, eb=0): + + print('Creating Testing Dataset') + testing_x, testing_y, input_3110, max_len_test = get_testing_set(data_path, emb_type = eb) + print("Creating Training Dataset") + training_x, training_y, input_squad, max_len_train = get_training_set(emb_type = eb) + + + imp_percent_test = np.sum(testing_y)/len(testing_y) + imp_percent_train = np.sum(training_y)/len(training_y) + print("% Impossible in train vs test: ", imp_percent_train, imp_percent_test ) + + print("\nUsing Logistic Regression") + lr_cls = logisticReg(training_x, training_y, testing_x, testing_y) + + + print("\nStarting predictions") + #preds = predictions(input_text) + + print("\nEvaluating predictions") + # p, r, f1 = evaluate(preds, answers, question, ids) + + #print some stats + print('\nEvaluation Stats are: ') + print('\tPrecision: ', p) + print('\tRecall: ', r) + print('\tF1 score: ', f1) + +if __name__ == '__main__': + args = passed_arguments() + data_path = args.data_path + emb_type = args.emb_type + main(data_path, emb_type) diff --git a/q_and_a/unanswerable.py b/q_and_a/unanswerable.py new file mode 100644 index 0000000..1d1053f --- /dev/null +++ b/q_and_a/unanswerable.py @@ -0,0 +1,314 @@ +import torch +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from tqdm import tqdm, trange +import pandas as pd +import io +import json +import os +import numpy as np +import random +from transformers import ( + BertTokenizer, + BertForQuestionAnswering, + BertModel, + #AutoTokenizer, + #AutoModelForQuestionAnswering +) + +import argparse +from collections import Counter + +from sklearn.linear_model import LogisticRegression +import requests + + +def passed_arguments(): + parser = argparse.ArgumentParser(description="Script to evaluate model predictions.") + #parser.add_argument("--model", + # type=int, + # required=True, + # help="Baseline model to test on. \n0 = BERT\n1=DistilBERT") + + parser.add_argument("--data_path", + type=str, + required=True, + help="Path to evaluation dataset") + + #parser.add_argument("--impossible_on", + # type=int, + # required=True, + # help="0: no impossible questions\n1: impossible questions") + + args = parser.parse_args() + return args + + +#helper returning the most frequent of a an array: +def most_frequent(List): + counter = 0 + num = List[0] + for i in List: + curr_frequency = List.count(i) + if(curr_frequency> counter): + counter = curr_frequency + num = i + return num + +# 1 when is_impossible, else 0 +def process_json(data_path): + print("starting yay") + question = [] + text = [] + answer = [] + is_impossible = [] + ids = [] + num_questions = [] # number of questions asked on the same context + + r = requests.get(url = data_path) + data = r.json()["data"] + data_len = len(data) + for d in data: + for p in d["paragraphs"]: + context = p["context"] + context_len = len(context) + num = 0 + for qa in p["qas"]: + quest = qa["question"] + quest_len = len(quest) + if (context_len+quest_len < 509): + question.append(quest) + if (len(qa["answers"])) != 0: + an = [] + for i in qa["answers"]: + an.append(i["text"]) + answer.append(most_frequent(an)) + + else: + answer.append("") + if qa["is_impossible"]: + is_impossible.append(1) + else: + is_impossible.append(0) + ids.append(qa["id"]) + num += 1 + if (num != 0): + text.append(context) + num_questions.append(num) + print("done") + return question, text, answer, is_impossible, ids, num_questions + + +def process_data(question, text, num_questions): + print("oho") + input_text = [] + for i in range(len(num_questions)): + for j in range(num_questions[i]): + input_text.append( "[CLS] " + question[j] + " [SEP] " + text[i] + " [SEP]") + return input_text + +#return the max_len, is_impossible, and CLS token of SQUAD dataset. Store the data in CSV file +def get_CLS(percentage): + question, text, answer, is_impossible, ids, num_questions = process_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json") + input_text = process_data(question, text, num_questions) + print("done") + input_text = input_text[int(percentage*len(input_text)):] + is_impossible = is_impossible[int(percentage*len(is_impossible)):] + + print(len(input_text)) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + print("working") + input_ids = [] + max_len = 0 + for text in input_text: + input_id = tokenizer.encode(text) + input_ids.append(input_id) + if len(input_id) > max_len: + max_len = len(input_id) + + #append the maximum length of the sentence to the list of is_impossible + is_impossible.append(max_len) + is_impossible.to_csv("is_impossible.csv", encoding='utf-8', index=False) + + padded = np.array([i + [0]*(max_len - len(i)) for i in input_ids]) + attention_mask = np.where(padded != 0, 1, 0) + input_ids = torch.tensor(padded) + print(input_ids.size) + attention_mask = torch.tensor(attention_mask) + print(attention_mask.size) + with torch.no_grad(): + last_hidden_states = model(input_ids, attention_mask = attention_mask) + print(last_hidden_states) + features = pd.DataFrame(last_hidden_states[0][:,0,:].numpy()) + print(features) + features.to_csv("features.csv", encoding='utf-8', index=False) + + return features, is_impossible, max_len + + +#pretrained logistic regression +def logisticReg(): + question, text, answer, is_impossible, ids, num_questions = process_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json") + input_text = process_data(question, text, num_questions) + print("done") + input_text = input_text[int(.99*len(input_text)):] + is_impossible = is_impossible[int(.99*len(is_impossible)):] + is_impossible.to_csv("is_impossible.csv", encoding='utf-8', index=False) + print(len(input_text)) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + print("working") + input_ids = [] + max_len = 0 + for text in input_text: + input_id = tokenizer.encode(text) + input_ids.append(input_id) + if len(input_id) > max_len: + max_len = len(input_id) + padded = np.array([i + [0]*(max_len - len(i)) for i in input_ids]) + attention_mask = np.where(padded != 0, 1, 0) + input_ids = torch.tensor(padded) + print(input_ids.size) + attention_mask = torch.tensor(attention_mask) + print(attention_mask.size) + with torch.no_grad(): + last_hidden_states = model(input_ids, attention_mask = attention_mask) + print(last_hidden_states) + features = last_hidden_states[0][:,0,:].numpy() + print(features) + features.to_csv("features.csv", encoding='utf-8', index=False) + log_model = LogisticRegression() + log_model.fit(features,is_impossible) + print("finished") + return log_model, max_len + + + +#pre-trained bert runs on evaluation sets +#return list of tokens for each question id +def predictions(input_text, print_some_outputs = True): + tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased') + model_bert = BertModel.from_pretrained('bert-base-uncased') + input_ids = [] + token_type_ids = [] + preds = [] + f = [] + print("training") + log_model, max_len= logisticReg() + print("lets train") + for text in input_text: + input_log = tokenizer_bert.encode(text) + input_log = input_log + [0] * (max_len - len(input_log)) + attention_mask = [0 if i == 0 else 1 + for i in input_log] + input_logs = torch.tensor([input_log]) + print(input_logs.size) + attention_mask = torch.tensor([attention_mask]) + print(attention_mask.size) + with torch.no_grad(): + last_hidden_states = model_bert(input_logs,attention_mask = attention_mask) + print(last_hidden_states) + features = last_hidden_states[0][:,0,:].numpy() + print(features) + f.append(features) + pred = log_model.predict(features) + if pred > 0: + preds.append("") + else: + input_ids = tokenizer.encode(text) + token_type_ids = [0 if i <= input_ids.index(102) else 1 + for i in range(len(input_ids))] + start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + preds.append((all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) + f.to_csv("features_pred.csv", encoding='utf-8', index=False) + return preds + + +#returns the precision, recall, and f1 score +#preds and labels should be tokenized +def evaluate(preds, labels, questions, ids): + print(len(preds)) + print(len(labels)) + tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + + labels_tok = [] + for l in labels: + encoded = tokenizer.encode(l) + to_tok = tokenizer.convert_ids_to_tokens(encoded) + labels_tok.append(to_tok[1:-1]) + + n = len(labels) + precision = 0 + recall = 0 + f1 = 0 + + for i in range(len(labels_tok)): + + p = preds[i] + l = labels_tok[i] + + if len(p) == 0 or len(l) == 0: + agree = 1 if len(p) == len(l) else 0 + precision += 1 + recall += 1 + f1 += 1 + + else: + common_toks = Counter(p) & Counter(l) + intersection = 1.0 * sum(common_toks.values()) + #calculate precision + pr = intersection/ len(p) + re = intersection / len(l) + precision += pr + #calculate recall + recall += re + #calculate f1 + f1 += 0 if (pr == 0 or re == 0) else (2*pr*re)/(pr+re) + if (pr == 0 or re == 0): + + print("\nBad answer example ",ids[i], ': ', questions[i]) + print("Prediction: ", ' '.join(p)) + print("Answer: ", ' '.join(l)) + + #average over all samples + precision = precision/n + recall = recall/n + f1 = f1/n + + return precision, recall, f1 + + + +def main(data_path): + print('Starting baseline evaluation\n') + + question, text, labels, is_impossible, ids, num_questions = process_json(data_path) + + print(len(question)) + print(len(labels)) + + input_text = process_data(question, text, num_questions) + print(len(input_text)) + + + print("Starting predictions") + preds = predictions(input_text) + + print("Evaluating predictions") + p, r, f1 = evaluate(preds, labels, question, ids) + + #print some stats + print('\nEvaluation Stats are: ') + print('\tPrecision: ', p) + print('\tRecall: ', r) + print('\tF1 score: ', f1) + +if __name__ == '__main__': + args = passed_arguments() + #model_id = args.model + data_path = args.data_path + #imp_toggle = args.impossible_on + main(data_path) \ No newline at end of file diff --git a/q_and_a/unanswerable2.py b/q_and_a/unanswerable2.py new file mode 100644 index 0000000..6c5c0bb --- /dev/null +++ b/q_and_a/unanswerable2.py @@ -0,0 +1,178 @@ +import torch +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset,SubsetRandomSampler +from tqdm import tqdm, trange +import pandas as pd +import io +import json +import os +import numpy as np +import random +from transformers import ( + BertTokenizer, + BertForQuestionAnswering, + BertModel, + #AutoTokenizer, + #AutoModelForQuestionAnswering +) +import torch.nn as nn +import torch.optim as optim +import numpy as np +from torchtext import data +import pandas as pd +import re +import argparse +from collections import Counter +from sklearn.linear_model import LogisticRegression +import requests + +class BertBinaryClassifier(nn.Module): + def __init__(self, dropout=0.1): + super(BertBinaryClassifier, self).__init__() + self.bert = BertModel.from_pretrained('bert-base-uncased') + self.l1 = nn.Linear(768, 512) + self.l2 = nn.Linear(512, 1) + self.sigmoid = nn.Sigmoid() + + + def forward(self, tokens, mask): + _, pooled_output = self.bert(tokens) + linear_output = self.l2(self.l1(pooled_output)) + proba = self.sigmoid(linear_output) + return proba + +#helper returning the most frequent of a an array: +def most_frequent(List): + counter = 0 + num = List[0] + for i in List: + curr_frequency = List.count(i) + if(curr_frequency> counter): + counter = curr_frequency + num = i + return num + +# 1 when is_impossible, else 0 +def process_json(data_path): + print("starting yay") + question = [] + text = [] + answer = [] + is_impossible = [] + ids = [] + num_questions = [] # number of questions asked on the same context + + r = requests.get(url = data_path) + data = r.json()["data"] + data_len = len(data) + for d in data: + for p in d["paragraphs"]: + context = p["context"] + context_len = len(context) + num = 0 + for qa in p["qas"]: + quest = qa["question"] + quest_len = len(quest) + if (context_len+quest_len < 509): + question.append(quest) + if (len(qa["answers"])) != 0: + an = [] + for i in qa["answers"]: + an.append(i["text"]) + answer.append(most_frequent(an)) + + else: + answer.append("") + if qa["is_impossible"]: + is_impossible.append(1) + else: + is_impossible.append(0) + ids.append(qa["id"]) + num += 1 + if (num != 0): + text.append(context) + num_questions.append(num) + print("done") + return question, text, answer, is_impossible, ids, num_questions + + +def process_data(question, text, num_questions): + print("oho") + input_text = [] + for i in range(len(num_questions)): + for j in range(num_questions[i]): + input_text.append( "[CLS] " + question[j] + " [SEP] " + text[i] + " [SEP]") + return input_text + + +def FC(percentage, EPOCHS, BATCH_SIZE): + question, text, answer, is_impossible, ids, num_questions = process_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json") + input_text = process_data(question, text, num_questions) + print("done") + input_text = input_text[int(percentage*len(input_text)):] + is_impossible = is_impossible[int(percentage*len(is_impossible)):] + + print(len(input_text)) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + bert_clf = BertBinaryClassifier() + print("working") + input_ids = [] + max_len = 0 + for text in input_text: + input_id = tokenizer.encode(text) + input_ids.append(input_id) + if len(input_id) > max_len: + max_len = len(input_id) + + padded = np.array([i + [0]*(max_len - len(i)) for i in input_ids]) + attention_mask = np.where(padded != 0, 1, 0) + input_ids = torch.tensor(padded) + print(input_ids.size) + attention_mask = torch.tensor(attention_mask) + print(attention_mask.size) + + train_tokens_tensor = input_ids[:int(0.8*len(input_ids))] + train_y_tensor = torch.tensor(is_impossible[:int(0.8*len(input_ids))]).float().reshape(-1,1) + test_tokens_tensor = input_ids[int(0.8*len(input_ids)):] + test_y_tensor = torch.tensor(is_impossible[int(0.8*len(input_ids)):]).float().reshape(-1,1) + + train_dataset = TensorDataset(train_tokens_tensor, train_y_tensor) + train_sampler = RandomSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE) + test_dataset = TensorDataset(test_tokens_tensor, test_y_tensor) + test_sampler = SequentialSampler(test_dataset) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE) + + optimizer = optim.Adam(bert_clf.parameters(), lr=3e-6) + loss_fn = nn.BCELoss() + + bert_clf.train() + print('Training model...') + for epoch_num in range(EPOCHS): + for step_num, (token_ids, labels) in enumerate(train_dataloader): + probas = bert_clf(token_ids) + batch_loss = loss_fn(probas, labels) + bert_clf.zero_grad() + batch_loss.backward() + optimizer.step() + print("epoch_num = " + epoch_num + " step_num = " + step_num) + print('Training Completed') + torch.save(bert_clf.state_dict(), "model.pt") + + bert_clf.eval() + print('Evaluating model...') + correct = 0 + for batch_index, (input_t, y) in enumerate(test_dataloader): + preds = bert_clf(input_t) + p = preds.reshape(-1).detach().numpy().round() + y1 = y.reshape(-1).detach().numpy() + for i in range(len(p)): + if p[i] == y1[i]: + correct = correct + 1 + loss = loss_fn(preds, y) + print(f"Loss: {loss.detach()}") + + print("Accuracy={}".format(correct/(0.2*len(input_ids)))) + + +if __name__ == '__main__': + FC(0.95, 5, 32) \ No newline at end of file