-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathread_data.py
More file actions
41 lines (30 loc) · 1.5 KB
/
read_data.py
File metadata and controls
41 lines (30 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import ast
import json
from pathlib import Path

import pandas as pd
def _parse_tag_list(raw):
    """Decode one ``keywords``/``topics`` cell of ``tags.csv`` into a Python object.

    Single quotes are swapped for double quotes and the result is decoded as
    JSON (presumably the cells hold Python-style list literals such as
    ``['a', 'b']`` -- confirm against the file that produces ``tags.csv``).
    When that fails, the untouched text is parsed as a Python literal instead,
    which copes with entries that contain an apostrophe.

    Raises:
        json.JSONDecodeError: if the cell can be decoded neither way.
    """
    try:
        return json.loads(raw.replace("'", '"'))
    except json.JSONDecodeError as json_error:
        # The quote swap corrupts entries such as "children's rights";
        # literal_eval safely parses the original text without evaluating code.
        try:
            return ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Surface the original decoding error so callers see one error type.
            raise json_error from None


def read_plaintext_with_keywords(pdf_reports_path) -> pd.DataFrame:
    """Return the plain text of the PDF reports together with the keywords and topics.

    Args:
        pdf_reports_path: Directory (``str`` or ``Path``) that contains
            ``tags.csv`` (with ``name``, ``keywords`` and ``topics`` columns)
            and a ``plaintext`` sub-directory of ``*.json`` files. Each JSON
            file must provide the keys ``"metadata"`` and ``"content"``.

    Returns:
        A DataFrame with one row per JSON file (sorted by path) and the
        columns ``filename``, ``title``, ``keywords``, ``topics``, ``text``,
        ``metadata``, ``num_keywords`` and ``num_topics``.

    Raises:
        IndexError: if a JSON file has no row with a matching ``name`` in
            ``tags.csv``.
        json.JSONDecodeError: if a ``keywords``/``topics`` cell cannot be parsed.
    """
    reports_dir = Path(pdf_reports_path)
    tags = pd.read_csv(str((reports_dir / "tags.csv").resolve()))
    plaintext_dir = (reports_dir / "plaintext").resolve()

    rows = []
    # Sort the directory entries so the row order does not depend on the OS.
    for json_path in sorted(plaintext_dir.iterdir()):
        if json_path.suffix != ".json":
            continue

        with json_path.open(encoding="utf-8") as json_file:
            parsed = json.load(json_file)

        filename = json_path.stem
        metadata = parsed["metadata"]
        # Filter once per file; the first matching row wins.
        tag_row = tags.loc[tags["name"] == filename]

        rows.append((
            filename,
            # Some documents do not have title among metadata.
            metadata.get("title", filename),
            _parse_tag_list(tag_row["keywords"].iloc[0]),
            _parse_tag_list(tag_row["topics"].iloc[0]),
            parsed["content"],
            metadata,
        ))

    # Build the frame in one go instead of growing it row by row.
    df = pd.DataFrame(
        rows,
        columns=["filename", "title", "keywords", "topics", "text", "metadata"],
    )
    df["num_keywords"] = df["keywords"].apply(len)
    df["num_topics"] = df["topics"].apply(len)
    return df