oscarso2000 · MelindaFang-code · Dec 2, 2020 · Mar 4, 2021 · Mar 11, 2021 · Mar 11, 2021
diff --git a/q_and_a/.dockerignore b/q_and_a/.dockerignore
@@ -0,0 +1,2 @@
+__pycache__
+venv
diff --git a/q_and_a/answer_selection.py b/q_and_a/answer_selection.py
@@ -0,0 +1,155 @@
+# imports
+import torch
+from torch.nn.functional import softmax
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from tqdm import tqdm, trange
+import io
+import json
+import os
+import numpy as np
+from transformers import (
+    BertTokenizer,
+    BertForQuestionAnswering,
+)
+import argparse
+import requests
+
+
+# arguments
+def passed_arguments():
+    """Parse the command-line arguments of the evaluation script.
+
+    Returns:
+        argparse.Namespace with ``data_path`` (str, required), ``top_n``
+        (int, default 1) and ``propose_cnt`` (int, default 10).
+    """
+    parser = argparse.ArgumentParser(description="Script to evaluate model predictions.")
+
+    parser.add_argument("--data_path",
+                        type=str,
+                        required=True,
+                        help="Path to evaluation dataset")
+    parser.add_argument("--top_n",
+                        type=int,
+                        required=False,
+                        default=1,
+                        help="Top n results to consider correct")
+    parser.add_argument("--propose_cnt",
+                        type=int,
+                        required=False,
+                        default=10,
+                        help="Number of contexts to test question on")
+
+    # Parse exactly once, after every option has been registered; returning
+    # earlier would leave --top_n / --propose_cnt unknown to the parser.
+    return parser.parse_args()
+
+
+def wrap_select(a, i, n):
+    """
+    Select a[i] plus up to n further distinct items, scanning a circularly.
+
+    a - list to select from
+    i - index of a to start from (a[i] is always the first returned item)
+    n - number of additional distinct items to collect after a[i]
+
+    Returns a list of at most n+1 items. Fewer are returned when a does not
+    hold enough distinct values; an empty a yields an empty list.
+    """
+    if not a:
+        return []
+
+    # true answer at first position
+    selected = [a[i]]
+    size = len(a)
+
+    # One full lap over the remaining positions is enough: anything not
+    # picked up by then is a duplicate, so looping further could never end.
+    for step in range(1, size):
+        if len(selected) >= n + 1:
+            break
+        candidate = a[(i + step) % size]
+        if candidate not in selected:
+            selected.append(candidate)
+    return selected
+
+
+def process_data_mult_text(question, contexts):
+    """
+    Build BERT question-answering inputs for one question against several contexts.
+
+    For every context the text "[CLS] question [SEP] context [SEP]" is built
+    and tokenized.
+
+    Returns three parallel lists, one entry per context:
+    input_text     - the marked-up text
+    input_ids      - token ids of that text
+    token_type_ids - one segment id per token: 0 up to and including the
+                     first [SEP] (question part), 1 afterwards (context part)
+    """
+    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+
+    input_text = []
+    input_ids = []
+    token_type_ids = []
+    for context in contexts:
+        text = "[CLS] " + question + " [SEP] " + context + " [SEP]"
+        input_text.append(text)
+
+        # The special tokens are already written into the text, so encode()
+        # must not wrap the sequence in a second [CLS] ... [SEP] pair.
+        encoded = tokenizer.encode(text, add_special_tokens=False)
+        input_ids.append(encoded)
+
+        # One segment id per token, split at the first [SEP].
+        first_sep = encoded.index(tokenizer.sep_token_id)
+        token_type_ids.append(
+            [0 if position <= first_sep else 1 for position in range(len(encoded))]
+        )
+
+    return input_text, input_ids, token_type_ids
+
+
+def get_3110_set(data_path, include_impossible = False):
+    """Load the 3110 question-answering dataset from a JSON file.
+
+    The file must hold an object with a "data" list whose entries provide the
+    keys "question", "context", "answer", "id" and "is_impossible".
+
+    data_path          - path of the JSON file
+    include_impossible - when False (default) entries whose "is_impossible"
+                         is non-zero are skipped
+
+    Returns four parallel lists: questions, contexts, answers and
+    is_impossible flags (1 or 0).
+    """
+    # JSON is UTF-8 by specification; do not rely on the platform default.
+    with open(data_path, encoding="utf-8") as f:
+        data = json.load(f)["data"]
+
+    question, text, answer, is_impossible = [], [], [], []
+    for d in data:
+        if include_impossible or d["is_impossible"] == 0:
+            is_impossible.append(1 if d["is_impossible"] else 0)
+            question.append(d["question"])
+            text.append(d["context"])
+            answer.append(d["answer"])
+            # Accessed only so an entry without an "id" still fails loudly,
+            # as it always has; the ids themselves are not returned.
+            d["id"]
+
+    return question, text, answer, is_impossible
+
+
+
+def evaluate_on_multiple_context(questions, contexts, answers, eval_on = 10, top_n = 1):
+    """
+    Check whether BERT scores a question's own context above other contexts.
+
+    For question i the candidates are contexts[i] (always candidate 0)
+    followed by other distinct contexts taken circularly from the list. Each
+    candidate is scored with the mean of the maximum start and end softmax
+    probabilities, and the question counts as correct when candidate 0 is
+    among the top_n scores.
+
+    questions - list of question strings
+    contexts  - list of context strings, parallel to questions
+    answers   - currently unused; kept for interface compatibility
+    eval_on   - number of candidate contexts scored per question
+    top_n     - how many best-scoring candidates count as a hit
+
+    Returns a numpy array with 1.0 (hit) or 0.0 (miss) per question.
+    """
+    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+    model.eval()
+    results = np.zeros(len(questions))
+
+    for i, q in enumerate(questions):
+
+        # get possible passages to test question on
+        context_sub = wrap_select(contexts, i, eval_on)
+
+        # format questions and context for bert input
+        input_text, input_ids, token_type_ids = process_data_mult_text(q, context_sub)
+
+        # Never index past the candidates that were actually produced.
+        candidate_cnt = min(eval_on, len(input_ids))
+        scores_avg = np.zeros(candidate_cnt)
+
+        # Pure inference: no autograd graph is needed.
+        # NOTE(review): inputs longer than the model's maximum sequence length
+        # are not truncated here - confirm the dataset stays within the limit.
+        with torch.no_grad():
+            for j in range(candidate_cnt):
+                start_scores, end_scores = model(
+                    torch.tensor([input_ids[j]]),
+                    token_type_ids=torch.tensor([token_type_ids[j]]),
+                    return_dict=False,
+                )
+                # Normalise over the token dimension (logits are batch x tokens).
+                soft_start = softmax(start_scores, dim=-1)
+                soft_end = softmax(end_scores, dim=-1)
+                scores_avg[j] = ((torch.max(soft_start) + torch.max(soft_end)) / 2).item()
+
+        # indices of the top_n best scores, best first
+        arr = scores_avg.argsort()[-top_n:][::-1]
+        results[i] = 1 if 0 in arr else 0
+
+    return results
+
+
+def main(data_path, top_n, propose_cnt):
+    """
+    Load the dataset, run the multi-context evaluation and print the accuracy.
+
+    data_path   - path of the evaluation JSON file
+    top_n       - number of best-scoring contexts that count as correct
+    propose_cnt - number of candidate contexts scored per question
+    """
+    print('Getting data')
+    questions, contexts, answers, is_impossible = get_3110_set(data_path)
+
+    print("Beginning Evaluation")
+    # Use the parameter, not a module-level global that only exists when the
+    # file is run as a script.
+    results = evaluate_on_multiple_context(questions, contexts, answers, eval_on=propose_cnt, top_n=top_n)
+    print("Finished Evaluation")
+    print("Acc: ", np.sum(results)/len(results))
+
+
+if __name__ == '__main__':
+    # Read the command-line options and hand them to main().
+    args = passed_arguments()
+    data_path = args.data_path
+    top_n = args.top_n
+    p_cnt = args.propose_cnt
+
+    main(data_path, top_n, p_cnt)
diff --git a/q_and_a/app.py b/q_and_a/app.py
@@ -0,0 +1,195 @@
+import json
+import os
+import re
+
+from flask import Flask, request, render_template, make_response
+from haystack.document_stores import FAISSDocumentStore
+from haystack.nodes import PDFToTextConverter, PreProcessor, FARMReader, DensePassageRetriever
+from haystack.pipelines import ExtractiveQAPipeline
+from haystack.schema import Document
+import pymysql
+
+from pipeline import answer
+
+app = Flask(__name__)
+
+# The database URI can be supplied through the SQLALCHEMY_DATABASE_URI
+# environment variable; the literal below is only the historical fallback.
+# NOTE(review): the fallback embeds credentials in source control - rotate
+# them and drop the literal once every deployment sets the variable.
+app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get(
+    'SQLALCHEMY_DATABASE_URI',
+    'mysql+pymysql://admin_mci:mycourseindex-qa@database-qa.cp4ury9dboly.us-east-1.rds.amazonaws.com/qa_docstore',
+)
+# Directory where uploaded files are stored and JSON inputs are read from.
+# app.config["input"] = "/usr/src/app/data/input"
+app.config["input"] = "./data/input"
+# Interface the server binds to when the module is run as a script.
+app.config["host"] = "0.0.0.0"
+
+def create_dpr(document_store):
+    """Build the DensePassageRetriever bound to ``document_store``.
+
+    Only the retriever is constructed and returned; embeddings in the store
+    are neither recomputed nor saved here.
+    """
+    dpr_settings = dict(
+        document_store=document_store,
+        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+        max_seq_len_query=64,
+        max_seq_len_passage=256,
+        batch_size=16,
+        use_gpu=True,
+        embed_title=True,
+        use_fast_tokenizers=True,
+    )
+    return DensePassageRetriever(**dpr_settings)
+
+def joinParagraph(str):
+    """Flatten extracted text into one line and strip markup residue.
+
+    Newlines become spaces; '*', backticks, the code-cell fence header and
+    backslash-space pairs are removed; '%temp%' becomes 'e'.
+
+    str - the text to clean (the name shadows the builtin but is kept for
+          backward compatibility with keyword callers)
+
+    Returns the cleaned string.
+    """
+    s = (
+        str.replace('\n', ' ')
+        .replace('*', '')
+        .replace('%temp%', 'e')
+        .replace('```{code-cell} ocaml', '')
+        .replace('\\ ', '')
+        .replace('`', '')
+    )
+    # NOTE(review): this pattern matches a literal backslash followed by the
+    # letter "s", exactly as the original non-raw '\\\s' did. If "backslash
+    # followed by whitespace" was intended, the pattern should be r'\\\s'.
+    return re.sub(r'\\s', ' ', s)
+
+@app.route("/query_pipe",methods=['POST'])
+def query_pipe():
+    """Answer a question with the retriever + reader pipeline.
+
+    Form fields:
+        question - the query text
+        lenans   - maximum number of answers to return (int)
+        lenretr  - number of documents the retriever should fetch (int)
+
+    Returns a JSON string with the answer texts under 'result' and the
+    contents of the documents they came from under 'context'.
+    """
+    q = request.form['question']
+    len_ans = int(request.form['lenans'])
+    len_retriever = int(request.form['lenretr'])
+    prediction = pipe.run(query=q, params={"Retriever": {"top_k": len_retriever}, "Reader": {"top_k": len_ans}})
+
+    # The reader may return fewer answers than requested; indexing with
+    # range(len_ans) would then raise IndexError.
+    answers = prediction['answers'][:max(len_ans, 0)]
+
+    doc_ids = [a.document_id for a in answers]
+    # NOTE(review): presumably the store returns one document per distinct id
+    # and not necessarily in answer order - confirm before relying on
+    # 'context' being aligned with 'result'.
+    docs = document_store.get_documents_by_id(doc_ids)
+    docs = [d.content for d in docs]
+    ans = [a.answer for a in answers]
+    return json.dumps({
+        'status':'success',
+        'message': 'Process succesfully', 
+        'result': ans,
+        'context': docs})
+
+@app.route("/query",methods=['POST'])
+def query():
+    """Answer a question using the retriever and the external ``answer`` helper.
+
+    Form fields: 'question', 'lenans' (int) and 'lenretr' (int). All three
+    are read, but only 'question' and 'lenretr' influence the result.
+
+    Returns a JSON string with whatever ``answer`` produced under 'result'.
+    """
+    form = request.form
+    q = form['question']
+    # Parsed for validation only; the value is not forwarded anywhere.
+    len_ans = int(form['lenans'])
+    len_retriever = int(form['lenretr'])
+
+    context = retriever.retrieve(query=q, top_k=len_retriever)
+    ans = answer(1, context, q)
+
+    payload = {'status':'success','message': 'Process succesfully', 'result': ans}
+    return json.dumps(payload)
+
+@app.route('/')
+def home():
+    """Root endpoint: reply with a fixed plain-text banner."""
+    banner = 'Hello QNA API is running'
+    return banner
+
+# Endpoint that refreshes document embeddings and persists the FAISS index.
+@app.route('/set_embed', methods=['POST'])
+def set_embed():
+    """Update embeddings in the document store, save the index, report the count.
+
+    Returns a JSON string whose 'result' is the store's embedding count after
+    the update.
+    """
+    # document_store.write_documents()
+    # update_existing_embeddings=False is passed, so presumably only documents
+    # that lack an embedding are processed - confirm against the haystack docs.
+    document_store.update_embeddings(retriever, update_existing_embeddings=False)
+    # Persist the index and its config under the same names used at start-up.
+    document_store.save("haystack_test_faiss", "haystack_test_faiss_config")
+    return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in FAISS Document', 'result': document_store.get_embedding_count()})
+
+@app.route('/get_docs')
+def get_docs():
+    """Return the content of the first document in the document store.
+
+    Returns a JSON string with that content under 'result', or a 'Failed'
+    status with an empty result when the store holds no documents.
+    """
+    documents = document_store.get_all_documents()
+    if not documents:
+        # An empty store used to surface as an IndexError (HTTP 500).
+        return json.dumps({'status':'Failed','message': 'No documents in the document store', 'result': []})
+
+    res = documents[0].content
+    # NOTE(review): the success message is copied from /set_embed and does
+    # not describe this endpoint; left unchanged in case clients match on it.
+    return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in FAISS Document', 'result': res})
+
+
+@app.route('/update_docstore_pdf', methods=['POST'])
+def update_document():
+    """Index an uploaded PDF into the document store.
+
+    Expects a multipart upload with the PDF under the 'doc' field. The file
+    is saved to the configured input directory, converted to text, cleaned,
+    split into passages and written to the document store.
+
+    Returns a JSON string with the stored file name and the document counts
+    before ('prevcount') and after ('currcount') the write, or a 'Failed'
+    status when no usable file was uploaded.
+    """
+    if not request.files:
+        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+    prev = document_store.get_document_count()
+    # uploaded document for target source
+    doc = request.files["doc"]
+
+    # The file name is client-controlled: keep only its last path component
+    # so the upload cannot be written outside the input directory.
+    safe_name = os.path.basename((doc.filename or "").replace("\\", "/"))
+    if safe_name in ("", ".", ".."):
+        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+    file_path = os.path.join(app.config["input"], safe_name)
+    # saving the file to the input directory
+    doc.save(file_path)
+
+    # convert the pdf file into a Document and clean its text
+    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
+    doc_pdf = converter.convert(file_path=file_path, meta=None)[0]
+    doc_pdf.content = joinParagraph(doc_pdf.content)
+
+    # split into passages before indexing
+    preprocessor = PreProcessor(
+        clean_empty_lines=True,
+        clean_whitespace=True,
+        clean_header_footer=False,
+        split_by="word",
+        split_length=100,
+        split_respect_sentence_boundary=True,
+    )
+    docs_default = preprocessor.process(doc_pdf)
+
+    document_store.write_documents(docs_default)
+    now = document_store.get_document_count()
+    # The saved upload is deliberately left in the input directory.
+
+    return json.dumps(
+        {'status':'Susccess', 'result': {'file': safe_name, 'prevcount': prev, "currcount": now}})
+
+@app.route('/update_docstore_json', methods=['POST'])
+def update_json():
+    """Index a JSON export from the input directory, then delete the file.
+
+    Form field 'filename' names a file inside the configured input directory.
+    The file must hold a list of objects shaped like
+    {'_source': {'content': str, 'answers': [{'content': str}, ...]}}; each
+    becomes one document whose text is the content followed by the answers
+    joined with '.'.
+
+    Returns a JSON string with the file name and the document counts before
+    ('prevcount') and after ('currcount') the write, or a 'Failed' status
+    when the file name is unusable.
+    """
+    prev = document_store.get_document_count()
+
+    # The name comes from the client and the file is opened and then removed:
+    # keep only its last path component so nothing outside the input
+    # directory can be read or deleted.
+    filename = os.path.basename(request.form['filename'].replace("\\", "/"))
+    if filename in ("", ".", ".."):
+        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+    file_path = os.path.join(app.config["input"], filename)
+    with open(str(file_path), 'r', encoding="utf-8") as j:
+        contents = json.load(j)
+
+    contentlists = []
+    for x in contents:
+        ans = [i['content'] for i in x['_source']['answers']]
+        contentlists.append({'content': x['_source']['content'] + '.'.join(ans)})
+
+    contentlists = [Document.from_dict(d) for d in contentlists]
+
+    document_store.write_documents(contentlists)
+    # The input file is consumed once its documents are stored.
+    os.remove(file_path)
+    now = document_store.get_document_count()
+
+    return json.dumps(
+            {'status':'Susccess', 'result': {'file': filename, 'prevcount': prev, "currcount": now}})
+    # else:
+    #     return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+
+@app.route('/upload_document', methods=['POST'])
+def upload_document():
+    """Save an uploaded file to the input directory without indexing it.
+
+    Expects a multipart upload with the file under the 'doc' field. Nothing
+    is written to the document store here, so 'prevcount' and 'currcount'
+    are simply the store's document count read before and after the save.
+
+    Returns a JSON string with the stored file name and both counts, or a
+    'Failed' status when no usable file was uploaded.
+    """
+    if not request.files:
+        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+    prev = document_store.get_document_count()
+    # uploaded document for target source
+    doc = request.files["doc"]
+
+    # The file name is client-controlled: keep only its last path component
+    # so the upload cannot be written outside the input directory.
+    safe_name = os.path.basename((doc.filename or "").replace("\\", "/"))
+    if safe_name in ("", ".", ".."):
+        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
+
+    file_path = os.path.join(app.config["input"], safe_name)
+    # saving the file to the input directory
+    doc.save(file_path)
+    now = document_store.get_document_count()
+
+    return json.dumps(
+        {'status':'Susccess', 'result': {'file': safe_name, 'prevcount': prev, "currcount": now}})
+
+# Script entry point: build the QA components, then start the Flask server.
+# NOTE(review): document_store, retriever, reader and pipe are bound only
+# inside this guard, so the route handlers above find them only when this
+# file is run directly as a script.
+if __name__ == '__main__':
+    # Listening port comes from the PORT environment variable, default 5000.
+    port = int(os.environ.get("PORT", 5000))
+# Earlier Elasticsearch-backed configuration, kept for reference.
+# document_store = ElasticsearchDocumentStore(
+#     host = cluster_ip,
+#     scheme="https",
+#     index='cs_4780_sp2021',
+#     username="mciesaccess",
+#     password="mcioscar"
+# )
+    # Load the FAISS index and config saved under these names (see /set_embed).
+    document_store = FAISSDocumentStore.load(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config")
+    retriever = create_dpr(document_store)
+    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, progress_bar=True, return_no_answer=True)
+    # Retriever + reader pipeline used by /query_pipe.
+    pipe = ExtractiveQAPipeline(reader, retriever)
+    # NOTE(review): debug=True enables Flask's debug mode while the host
+    # configured above is 0.0.0.0 (all interfaces) - turn debug off outside
+    # local development.
+    app.run(host=app.config["host"], port=port, debug=True)