Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
e8eb98a
small documentation changes
Dec 2, 2020
ab04fdb
tensor 3
Mar 4, 2021
469df22
FC implementatio with 63% accuracy
Mar 11, 2021
9123661
separate file for creating CLS token
Mar 11, 2021
d3c13fe
combine saving CLS with log regression
Mar 11, 2021
ba1bfef
small documentation changes
Dec 2, 2020
fdd6d23
tensor 3
Mar 4, 2021
2049bcc
FC implementatio with 63% accuracy
Mar 11, 2021
74e83d1
separate file for creating CLS token
Mar 11, 2021
6132f94
combine saving CLS with log regression
Mar 11, 2021
fc86084
delete old file, finished data processing
Mar 18, 2021
4e7b021
removed redundant files
Mar 18, 2021
1b8fa2b
avg token emb
Apr 9, 2021
58f17c1
answer selection + model calibration testing
Apr 21, 2021
c8ba505
first commit for code review
MelindaFang-code Mar 30, 2022
16cf921
first commit for code review
MelindaFang-code Mar 30, 2022
9b4701a
Merge branch 'q_and_a' of https://github.com/MelindaFang-code/MyCours…
MelindaFang-code Mar 31, 2022
6f5bc4e
combine saving CLS with log regression
Mar 11, 2021
83e9728
Merge branch 'q_and_a' of https://github.com/MelindaFang-code/MyCours…
MelindaFang-code Mar 31, 2022
175d52d
changes to develop local ml pipeline app
MelindaFang-code Apr 23, 2022
1724487
update current progress, need to use intel chip mac to run
MelindaFang-code May 1, 2022
97859b3
commit changes
MelindaFang-code May 1, 2022
cdad729
Merge branch 'q_and_a' of https://github.com/oscarso2000/MyCourseInde…
MelindaFang-code May 1, 2022
b97fc66
Merge branch 'q_and_a' into q_and_a
MelindaFang-code May 1, 2022
1c28511
Merge pull request #191 from MelindaFang-code/q_and_a
MelindaFang-code May 1, 2022
d9f262a
change to app.py
MelindaFang-code May 2, 2022
74996b1
sql
MelindaFang-code May 5, 2022
1a65c28
delstuff
MelindaFang-code May 5, 2022
dcb0c60
delete
MelindaFang-code May 5, 2022
c3cb4b3
no circular import
MelindaFang-code May 5, 2022
38c4b81
update app.py solve keep download issue
MelindaFang-code May 5, 2022
83bfef5
update requirements
MelindaFang-code May 5, 2022
d08d03d
final success version of qa
MelindaFang-code May 6, 2022
81e3b7b
finally working qa app
MelindaFang-code May 8, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions q_and_a/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__
venv
155 changes: 155 additions & 0 deletions q_and_a/answer_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#import
import torch
from torch.nn.functional import softmax
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import io
import json
import os
import numpy as np
from transformers import (
BertTokenizer,
BertForQuestionAnswering,
)
import argparse
import requests


#arguments
def passed_arguments():
    """Parse the command-line options of the evaluation script.

    Returns:
        argparse.Namespace with the attributes ``data_path`` (str, required),
        ``top_n`` (int, default 1) and ``propose_cnt`` (int, default 10).
    """
    parser = argparse.ArgumentParser(description="Script to evaluate model predictions.")

    parser.add_argument("--data_path",
                        type=str,
                        required=True,
                        help="Path to evaluation dataset")
    parser.add_argument("--top_n",
                        type=int,
                        required=False,
                        default=1,
                        help="Top n results to consider correct")
    parser.add_argument("--propose_cnt",
                        type=int,
                        required=False,
                        default=10,
                        help="Number of contexts to test question on")

    # Parse exactly once, after every option has been registered; returning
    # after the first add_argument() left --top_n and --propose_cnt undefined.
    return parser.parse_args()


def wrap_select(a, i, n):
    """Select ``a[i]`` followed by up to ``n`` other distinct items of ``a``.

    The scan starts right after index ``i`` and wraps around the end of the
    list. Items equal to one already selected are skipped.

    a - list to select from
    i - index of a to start from (this item is always first in the result)
    n - number of additional items to select

    Returns a list of ``n + 1`` items, or fewer when ``a`` does not contain
    that many distinct values. Raises IndexError when ``i`` is out of range.
    """
    # true answer in first position
    selected = [a[i]]
    size = len(a)

    # Visit every other position at most once. The previous unbounded loop
    # never terminated when `a` held fewer than n + 1 distinct values.
    for offset in range(1, size):
        if len(selected) >= n + 1:
            break
        candidate = a[(i + offset) % size]
        if candidate not in selected:
            selected.append(candidate)
    return selected


def process_data_mult_text(question, contexts, tokenizer=None):
    """Build tokenizer inputs pairing one question with each context.

    For every context the text ``"[CLS] question [SEP] context [SEP]"`` is
    assembled and run through the tokenizer.

    Args:
        question: question string.
        contexts: sequence of context strings.
        tokenizer: optional object with an ``encode(text)`` method returning a
            list of token ids. When omitted, the pretrained SQuAD BERT
            tokenizer is loaded; pass one in to avoid reloading it per call.

    Returns:
        Tuple ``(input_text, input_ids, token_type_ids)`` of parallel lists,
        one entry per context. Each ``token_type_ids`` entry has the same
        length as the matching ``input_ids`` entry.

    Raises:
        ValueError: if an encoding contains no token with id 102.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    input_text = []
    input_ids = []
    token_type_ids = []
    for context in contexts:
        text = "[CLS] " + question + " [SEP] " + context + " [SEP]"
        input_text.append(text)

        # NOTE(review): encode() runs with its defaults; if those also add
        # special tokens, the markers written into `text` are duplicated --
        # TODO confirm against the tokenizer documentation.
        encoded = tokenizer.encode(text)
        input_ids.append(encoded)

        # 102 is presumably the [SEP] id of this vocabulary. Everything up to
        # and including the first one is segment 0 (question), the rest is
        # segment 1 (context). One id per token: the earlier code produced a
        # single-element list derived from the context index.
        first_sep = encoded.index(102)
        token_type_ids.append([0 if pos <= first_sep else 1 for pos in range(len(encoded))])

    return input_text, input_ids, token_type_ids


def get_3110_set(data_path, include_impossible=False):
    """Load the 3110 question-answering dataset from a JSON file.

    The file must hold an object whose ``"data"`` entry is a list of records
    with the keys ``question``, ``context``, ``answer`` and ``is_impossible``.

    Args:
        data_path: path of the JSON file.
        include_impossible: when False (default), only records whose
            ``is_impossible`` equals 0 are kept.

    Returns:
        Tuple ``(question, text, answer, is_impossible)`` of parallel lists;
        ``is_impossible`` holds 1 or 0 per kept record.
    """
    # Read everything first so the file handle is released before processing.
    with open(data_path, encoding="utf-8") as f:
        data = json.load(f)["data"]

    question, text, answer, is_impossible = [], [], [], []
    for d in data:
        if include_impossible or d["is_impossible"] == 0:
            is_impossible.append(1 if d["is_impossible"] else 0)
            question.append(d["question"])
            text.append(d["context"])
            answer.append(d["answer"])

    return question, text, answer, is_impossible



def evaluate_on_multiple_context(questions, contexts, answers, eval_on=10, top_n=1):
    """Check whether the model ranks each question's own context highest.

    For question ``i`` a candidate list is built with ``contexts[i]`` at
    position 0. Each candidate is scored as the mean of the maximum start
    probability and the maximum end probability produced by the model. The
    question counts as correct when candidate 0 is among the ``top_n``
    highest-scoring candidates.

    Args:
        questions: list of question strings.
        contexts: list of context strings, aligned with ``questions``.
        answers: accepted for interface compatibility; not used here.
        eval_on: number of candidate contexts scored per question.
        top_n: how many top-scoring candidates count as a hit.

    Returns:
        numpy array with one entry per question: 1.0 for a hit, else 0.0.
    """
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    results = np.zeros(len(questions))

    for i, q in enumerate(questions):

        # get possible passages to test question on (true context first)
        context_sub = wrap_select(contexts, i, eval_on)

        # format questions and context for bert input
        input_text, input_ids, token_type_ids = process_data_mult_text(q, context_sub)

        # Never index past the candidates that were actually produced.
        n_candidates = min(eval_on, len(input_ids))
        scores_avg = np.zeros(n_candidates)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for j in range(n_candidates):
                start_scores, end_scores = model(
                    torch.tensor([input_ids[j]]),
                    token_type_ids=torch.tensor([token_type_ids[j]]),
                    return_dict=False,
                )
                # Normalise over the token axis explicitly; calling softmax
                # without `dim` is deprecated.
                soft_start = softmax(start_scores, dim=-1)
                soft_end = softmax(end_scores, dim=-1)
                scores_avg[j] = ((torch.max(soft_start) + torch.max(soft_end)) / 2).item()

        # indices of the top_n best scores, best first
        arr = scores_avg.argsort()[-top_n:][::-1]
        results[i] = 1 if 0 in arr else 0

    return results


def main(data_path, top_n, propose_cnt):
    """Load the dataset, run the context-selection evaluation, print accuracy.

    Args:
        data_path: path to the evaluation dataset JSON file.
        top_n: number of top-ranked contexts that count as correct.
        propose_cnt: number of candidate contexts scored per question.
    """
    print('Getting data')
    questions, contexts, answers, is_impossible = get_3110_set(data_path)

    print("Beginning Evaluation")
    # Use the parameter: the previous code read a global `p_cnt`, which only
    # exists when the module is executed as a script.
    results = evaluate_on_multiple_context(questions, contexts, answers, eval_on=propose_cnt, top_n=top_n)
    print("Finished Evaluation")
    print("Acc: ", np.sum(results)/len(results))


if __name__ == '__main__':
    # Script entry point: parse the CLI options and run the evaluation.
    args = passed_arguments()
    data_path = args.data_path
    top_n = args.top_n
    # `args`, not `arg`: the misspelled name raised NameError before main() ran.
    p_cnt = args.propose_cnt

    main(data_path, top_n, p_cnt)
195 changes: 195 additions & 0 deletions q_and_a/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
from flask import Flask, request, render_template, make_response
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import PDFToTextConverter, PreProcessor, FARMReader, DensePassageRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Document
from pipeline import answer
import pymysql
import json
import os

app = Flask(__name__)

# NOTE(review): the fallback URI embeds database credentials in source
# control. Set SQLALCHEMY_DATABASE_URI in the environment instead, then rotate
# the password and remove this literal.
app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get(
    'SQLALCHEMY_DATABASE_URI',
    'mysql+pymysql://admin_mci:mycourseindex-qa@database-qa.cp4ury9dboly.us-east-1.rds.amazonaws.com/qa_docstore',
)
# Directory where uploaded files are stored before indexing.
# app.config["input"] = "/usr/src/app/data/input"
app.config["input"] = "./data/input"
# Interface the development server binds to (all interfaces).
app.config["host"] = "0.0.0.0"

def create_dpr(document_store):
    """Construct the DensePassageRetriever bound to ``document_store``.

    Only the retriever object is built; no embeddings are computed and
    nothing is saved here.

    Args:
        document_store: document store handed to the retriever.

    Returns:
        The configured DensePassageRetriever instance.
    """
    dpr_options = dict(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )
    return DensePassageRetriever(**dpr_options)

def joinParagraph(str):
    """Flatten extracted document text into a single cleaned-up line.

    Applied in order: newlines become spaces; ``*`` is removed; ``%temp%``
    becomes ``e``; the ```` ```{code-cell} ocaml ```` fence marker is removed;
    backslash-space pairs are removed; backticks are removed; finally each
    literal two-character sequence backslash + ``s`` becomes a space.

    Args:
        str: text to clean. The name shadows the built-in and is kept only so
            existing callers keep working.

    Returns:
        The cleaned string.
    """
    s = (
        str.replace('\n', ' ')
        .replace('*', '')
        .replace('%temp%', 'e')
        .replace('```{code-cell} ocaml', '')
        .replace('\\ ', '')
        .replace('`', '')
    )
    # The old pattern '\\\s' matched a literal backslash followed by "s", not
    # whitespace, so a plain replace is equivalent. It also removes the use of
    # `re`, which this module never imports.
    # NOTE(review): if whitespace collapsing was intended instead, confirm.
    return s.replace('\\s', ' ')

@app.route("/query_pipe",methods=['POST'])
def query_pipe():
    """Answer a question with the extractive QA pipeline.

    Form fields:
        question: the question text.
        lenans: maximum number of answers to return (integer).
        lenretr: number of documents the retriever should fetch (integer).

    Returns:
        JSON string with ``status``, ``message``, ``result`` (answer strings)
        and ``context`` (content of the documents looked up by the answers'
        document ids).
    """
    q = request.form['question']
    len_ans = int(request.form['lenans'])
    len_retriever = int(request.form['lenretr'])
    prediction = pipe.run(query=q, params={"Retriever": {"top_k": len_retriever}, "Reader": {"top_k": len_ans}})

    # Use the answers actually returned: indexing range(len_ans) raised
    # IndexError whenever the reader produced fewer answers than requested.
    top_answers = prediction['answers'][:max(len_ans, 0)]

    # NOTE(review): the reader is created with return_no_answer=True, so an
    # answer may carry no document id -- confirm how the store handles that.
    doc_ids = [a.document_id for a in top_answers]
    docs = document_store.get_documents_by_id(doc_ids)
    docs = [d.content for d in docs]
    ans = [a.answer for a in top_answers]
    return json.dumps({
        'status':'success',
        'message': 'Process succesfully',
        'result': ans,
        'context': docs})

@app.route("/query",methods=['POST'])
def query():
    """Answer a question from retrieved passages via ``pipeline.answer``.

    Form fields:
        question: the question text.
        lenans: must be an integer; parsed but not otherwise used here.
        lenretr: number of passages to retrieve (integer).

    Returns:
        JSON string with ``status``, ``message`` and ``result``.
    """
    form = request.form
    question = form['question']
    # Parsed only for validation parity with /query_pipe.
    len_ans = int(form['lenans'])
    top_k = int(form['lenretr'])

    passages = retriever.retrieve(query=question, top_k=top_k)
    payload = {
        'status':'success',
        'message': 'Process succesfully',
        'result': answer(1, passages, question),
    }
    return json.dumps(payload)

@app.route('/')
def home():
    """Root endpoint: reply with a fixed plain-text banner."""
    banner = 'Hello QNA API is running'
    return banner

# Endpoint that refreshes document embeddings and saves the index to disk.
@app.route('/set_embed', methods=['POST'])
def set_embed():
    """Update embeddings in the document store and persist the index.

    Calls ``update_embeddings`` with ``update_existing_embeddings=False``,
    saves the store under ``haystack_test_faiss`` /
    ``haystack_test_faiss_config``, and reports the embedding count.

    Returns:
        JSON string with ``status``, ``message`` and ``result`` (the
        embedding count read after saving).
    """
    document_store.update_embeddings(retriever, update_existing_embeddings=False)
    document_store.save("haystack_test_faiss", "haystack_test_faiss_config")

    embedded_total = document_store.get_embedding_count()
    response = {
        'status':'Susccess',
        'message': 'Sucessfully embeded method updated in FAISS Document',
        'result': embedded_total,
    }
    return json.dumps(response)

@app.route('/get_docs')
def get_docs():
    """Return the content of the first document in the document store.

    Returns:
        JSON string with ``status``, ``message`` and ``result``. ``result`` is
        the first document's content, or an empty list with status ``Failed``
        when the store holds no documents.
    """
    documents = document_store.get_all_documents()
    # An empty store previously raised IndexError on `[0]`.
    if not documents:
        return json.dumps({'status':'Failed','message': 'No documents in store', 'result': []})

    res = documents[0].content
    # NOTE(review): the message text was copied from /set_embed; it is left
    # unchanged so existing clients see the same payload.
    return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in FAISS Document', 'result': res})


@app.route('/update_docstore_pdf', methods=['POST'])
def update_document():
    """Index an uploaded PDF into the document store.

    Expects a multipart upload under the field name ``doc``. The file is
    saved into the configured input directory, converted to text, cleaned
    with ``joinParagraph``, split by ``PreProcessor`` and written to the
    document store.

    Returns:
        JSON string. On success ``result`` holds the uploaded filename and the
        document counts before and after; otherwise status is ``Failed``.
    """
    if not request.files:
        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})

    prev = document_store.get_document_count()
    # uploaded document for target source
    doc = request.files["doc"]

    # The client controls the filename: keep only its last path component so
    # the upload cannot be written outside the input directory.
    safe_name = os.path.basename(doc.filename or "")
    if safe_name in ("", ".", ".."):
        return json.dumps({'status':'Failed','message': 'Invalid filename', 'result': []})
    file_path = os.path.join(app.config["input"], safe_name)

    # saving the file to the input directory
    doc.save(file_path)

    # convert the pdf file into a Document (was `filepath`, an undefined name)
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    doc_pdf = converter.convert(file_path=file_path, meta=None)[0]
    # was `join_Paragraph`, which does not exist in this module
    doc_pdf.content = joinParagraph(doc_pdf.content)

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=False,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
    )
    docs_default = preprocessor.process(doc_pdf)

    document_store.write_documents(docs_default)
    now = document_store.get_document_count()

    return json.dumps(
        {'status':'Susccess', 'result': {'file': doc.filename, 'prevcount': prev, "currcount": now}})

@app.route('/update_docstore_json', methods=['POST'])
def update_json():
    """Index a JSON file from the input directory, then delete it.

    Form fields:
        filename: name of a file inside the configured input directory.

    The file must contain a list of records. For each record,
    ``_source.content`` is concatenated with the ``content`` of every entry
    in ``_source.answers`` (joined by ``.``) and written to the document
    store as one document.

    Returns:
        JSON string. On success ``result`` holds the filename and the document
        counts before and after; status is ``Failed`` for an unusable
        filename.
    """
    prev = document_store.get_document_count()
    filename = request.form['filename']

    # `filename` comes from the client and is passed to open() and
    # os.remove(): restrict it to a bare name inside the input directory so
    # it cannot read or delete arbitrary files.
    safe_name = os.path.basename(filename)
    if safe_name in ("", ".", ".."):
        return json.dumps({'status':'Failed','message': 'Invalid filename', 'result': []})
    file_path = os.path.join(app.config["input"], safe_name)

    with open(file_path, 'r', encoding='utf-8') as j:
        contents = json.load(j)

    contentlists = [
        Document.from_dict({
            'content': x['_source']['content']
            + '.'.join(i['content'] for i in x['_source']['answers'])
        })
        for x in contents
    ]

    document_store.write_documents(contentlists)
    # The source file is consumed once it has been indexed.
    os.remove(file_path)
    now = document_store.get_document_count()

    return json.dumps(
        {'status':'Susccess', 'result': {'file': filename, 'prevcount': prev, "currcount": now}})


@app.route('/upload_document', methods=['POST'])
def upload_document():
    """Save an uploaded file into the input directory without indexing it.

    Expects a multipart upload under the field name ``doc``. Nothing is
    written to the document store here, so both reported counts are equal.

    Returns:
        JSON string. On success ``result`` holds the uploaded filename and the
        document count read before and after saving; otherwise status is
        ``Failed``.
    """
    if not request.files:
        return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})

    # uploaded document for target source
    prev = document_store.get_document_count()
    doc = request.files["doc"]

    # The client controls the filename: keep only its last path component so
    # the upload cannot be written outside the input directory.
    safe_name = os.path.basename(doc.filename or "")
    if safe_name in ("", ".", ".."):
        return json.dumps({'status':'Failed','message': 'Invalid filename', 'result': []})
    file_path = os.path.join(app.config["input"], safe_name)

    # saving the file to the input directory
    doc.save(file_path)
    now = document_store.get_document_count()

    return json.dumps(
        {'status':'Susccess', 'result': {'file': doc.filename, 'prevcount': prev, "currcount": now}})

if __name__ == '__main__':
    # Script entry point: load the persisted FAISS store, build the retriever
    # and reader, and start the development server. The route handlers read
    # `document_store`, `retriever` and `pipe` as module globals.
    port = int(os.environ.get("PORT", 5000))
    document_store = FAISSDocumentStore.load(index_path="haystack_test_faiss", config_path="haystack_test_faiss_config")
    retriever = create_dpr(document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, progress_bar=True, return_no_answer=True)
    pipe = ExtractiveQAPipeline(reader, retriever)

    # The server binds to app.config["host"] ("0.0.0.0", every interface).
    # The interactive debugger allows code execution from the browser, so it
    # is now opt-in: set FLASK_DEBUG=1 for local development.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host=app.config["host"], port=port, debug=debug)
Loading