Skip to content

Commit 8e1cebe

Browse files
authored
Merge pull request #124 from OscarAR46/feature/issue-capi-parser-57
Hi @OscarAR46 , thanks so much, and thanks for the reminder. Sorry I didn't get time to check it. It all looks good to me! I'll ping you when it's deployed live to the server and we can test it in the front end.
2 parents d96d8bb + 8f15007 commit 8e1cebe

2 files changed

Lines changed: 174 additions & 0 deletions

File tree

src/harmony/parsing/capi_parser.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
'''
2+
MIT License
3+
4+
Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
5+
Project: Harmony (https://harmonydata.ac.uk)
6+
Maintainer: Thomas Wood (https://fastdatascience.com)
7+
8+
Permission is hereby granted, free of charge, to any person obtaining a copy
9+
of this software and associated documentation files (the "Software"), to deal
10+
in the Software without restriction, including without limitation the rights
11+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
copies of the Software, and to permit persons to whom the Software is
13+
furnished to do so, subject to the following conditions:
14+
15+
The above copyright notice and this permission notice shall be included in all
16+
copies or substantial portions of the Software.
17+
18+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24+
SOFTWARE.
25+
26+
'''
27+
28+
import re
29+
import traceback
30+
from typing import List
31+
from langdetect import detect
32+
from langdetect.lang_detect_exception import LangDetectException
33+
from harmony.schemas.requests.text import RawFile, Instrument, Question
34+
35+
# Matches a string that consists solely of a CAPI variable code: two to eight
# uppercase ASCII letters followed by zero or more digits (e.g. AHATEA).
# Group 1 captures the code. extract_capi_questions applies it to stripped lines.
re_capi_code = re.compile(r'^([A-Z]{2,8}[0-9]*)$')
36+
37+
38+
def is_capi_format(text: str) -> bool:
    """Heuristically decide whether a document's text looks like a CAPI questionnaire.

    A line counts as a "code line" when, after stripping surrounding whitespace,
    it starts with an uppercase letter followed by at least two uppercase
    letters or digits and then a whitespace character. The text is considered
    CAPI when at least 10 such lines exist and they make up at least 2% of all
    lines.

    :param text: Full text content of the document.
    :return: True if the text passes the CAPI heuristic. Empty text, or text
        shorter than 100 characters, is always rejected.
    """
    # Too little text to make a meaningful decision.
    if not text or len(text) < 100:
        return False

    starts_with_code = re.compile(r'^[A-Z][A-Z0-9][A-Z0-9]+\s').match
    all_lines = text.split("\n")

    # Tally the lines that begin with a CAPI-style code (e.g. AHATEA, BHOWREL).
    hits = sum(1 for raw_line in all_lines if starts_with_code(raw_line.strip()))
    share = hits / max(len(all_lines), 1)

    # Require both an absolute minimum and a minimum density of code lines.
    return hits >= 10 and share >= 0.02
56+
57+
58+
# Code-shaped tokens that are instructions, routing keywords or plain words
# rather than question variable names.
_CAPI_NON_QUESTION_CODES = frozenset(
    ['CARD', 'NOTE', 'READ', 'CODE', 'TEXT', 'ENDIF', 'ELSE', 'AND', 'THE', 'FOR']
)

# Leading routing markers: any mix of "|" and whitespace at the start of a line.
_RE_CAPI_ROUTING_PREFIX = re.compile(r'^[\s|]+')

# Answer options start with a number followed by whitespace, e.g. "1 Yes".
_RE_CAPI_ANSWER_OPTION = re.compile(r'^\d+\s')

# How many lines after a code line are scanned for its question text.
_CAPI_LOOKAHEAD_LINES = 9


def _collect_capi_question_text(lines: List[str], code_idx: int) -> str:
    """Gather the question text that follows the code line at ``code_idx``."""
    fragments = []
    for raw_line in lines[code_idx + 1:code_idx + 1 + _CAPI_LOOKAHEAD_LINES]:
        # Remove routing indicators. All leading pipes are dropped, including
        # nested ones such as "| | text"; stripping only the first pipe would
        # leave a stray "|" in the text and hide answer options / codes from
        # the checks below.
        candidate = _RE_CAPI_ROUTING_PREFIX.sub('', raw_line).strip()

        # The next variable code starts a new question.
        if re_capi_code.match(candidate):
            break

        if not candidate:
            # A blank line ends the question once some text has been collected;
            # blank lines before any text are ignored.
            if fragments:
                break
            continue

        # Long all-caps lines are treated as interviewer instructions.
        if candidate.isupper() and len(candidate) > 10:
            continue

        # Answer options mark the end of the question wording.
        if _RE_CAPI_ANSWER_OPTION.match(candidate):
            break

        # Page header/footer lines.
        if 'Module' in candidate and any(ch.isdigit() for ch in candidate):
            continue
        if 'Millennium Cohort' in candidate:
            continue

        fragments.append(candidate)

    # Collapse runs of whitespace so the text reads as a single line.
    return re.sub(r'\s+', ' ', " ".join(fragments)).strip()


def extract_capi_questions(text: str) -> List[dict]:
    """Extract questions from CAPI formatted text.

    A question starts at a line that consists solely of a variable code (as
    matched by ``re_capi_code``). The question wording is collected from the
    following lines (at most nine), with leading ``|`` routing markers removed,
    until another code, an answer option, or a blank line after some text is
    reached.

    :param text: Full text content of the document. Empty or ``None`` yields
        an empty list.
    :return: One dict per accepted question, in document order, with the keys
        ``question_no`` (the variable code), ``question_text`` (whitespace
        normalised wording) and ``line_idx`` (0-based index of the code line).
    """
    if not text:
        return []

    lines = text.split("\n")
    questions = []

    for idx, line in enumerate(lines):
        # The anchored code pattern cannot match blank lines or table of
        # contents entries (which contain "..." or "___"), so no separate
        # check for those is needed.
        # NOTE(review): code lines are not stripped of "|" here, so a code
        # that sits inside a routed block ("| CODE") ends the previous
        # question but is not extracted itself - confirm whether intended.
        match = re_capi_code.match(line.strip())
        if not match:
            continue

        code = match.group(1)

        # Skip common non-question codes and codes that are too short.
        if code in _CAPI_NON_QUESTION_CODES or len(code) < 3:
            continue

        question_text = _collect_capi_question_text(lines, idx)

        # Skip feed-forward metadata.
        if '(from feed forward)' in question_text.lower():
            continue

        # Only keep text that is long enough to be a meaningful question:
        # more than 15 characters with a question mark, or more than 30.
        if len(question_text) > 15 and ('?' in question_text or len(question_text) > 30):
            questions.append({
                'question_no': code,
                'question_text': question_text,
                'line_idx': idx
            })

    return questions
128+
129+
130+
def convert_capi_to_instruments(file: RawFile, text_content: str) -> List[Instrument]:
    """Build Harmony Instruments from the text of a CAPI formatted PDF.

    The questions are pulled out with ``extract_capi_questions``. When any are
    found they are wrapped in a single Instrument whose id is the file id
    suffixed with ``_0`` and whose section is ``"CAPI"``.

    :param file: The uploaded file; its id, name and type are copied onto the
        instrument.
    :param text_content: Text of the document to parse.
    :return: A list holding one Instrument, or an empty list when no questions
        could be extracted.
    """
    extracted = extract_capi_questions(text_content)
    if not extracted:
        return []

    # CAPI questions carry no intro, answer options or page information here.
    questions = [
        Question(
            question_no=item['question_no'],
            question_intro="",
            question_text=item['question_text'],
            options=[],
            source_page=0
        )
        for item in extracted
    ]

    # Default to English; keep the default if language detection fails.
    language = "en"
    try:
        usable_texts = [
            item['question_text']
            for item in extracted
            if isinstance(item['question_text'], str) and item['question_text'].strip()
        ]
        if usable_texts:
            language = detect(" ".join(usable_texts))
    except LangDetectException:
        print("Error identifying language in CAPI file")
        traceback.print_exc()

    return [
        Instrument(
            file_id=file.file_id,
            instrument_id=file.file_id + "_0",
            file_name=file.file_name,
            instrument_name=file.file_name,
            file_type=file.file_type,
            file_section="CAPI",
            language=language,
            questions=questions
        )
    ]

src/harmony/parsing/pdf_parser.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
'''
2727

2828
import re
29+
from harmony.parsing.capi_parser import is_capi_format, convert_capi_to_instruments
2930

3031
import torch
3132
from harmony.parsing.util.tika_wrapper import parse_pdf_to_list
@@ -137,6 +138,10 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
137138
pages = [file.text_content]
138139
pages = [file.text_content]
139140

141+
# Check if this is a CAPI format PDF - use regex-based parser instead of ML model
142+
if is_capi_format(file.text_content):
143+
return convert_capi_to_instruments(file, file.text_content)
144+
140145
# Run prediction script to return questions and answers from file text content
141146

142147
question_texts_entire_document = []

0 commit comments

Comments
 (0)