1+ '''
2+ MIT License
3+
4+ Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
5+ Project: Harmony (https://harmonydata.ac.uk)
6+ Maintainer: Thomas Wood (https://fastdatascience.com)
7+
8+ Permission is hereby granted, free of charge, to any person obtaining a copy
9+ of this software and associated documentation files (the "Software"), to deal
10+ in the Software without restriction, including without limitation the rights
11+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+ copies of the Software, and to permit persons to whom the Software is
13+ furnished to do so, subject to the following conditions:
14+
15+ The above copyright notice and this permission notice shall be included in all
16+ copies or substantial portions of the Software.
17+
18+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24+ SOFTWARE.
25+
26+ '''
27+
28+ import re
29+ import traceback
30+ from typing import List
31+ from langdetect import detect
32+ from langdetect .lang_detect_exception import LangDetectException
33+ from harmony .schemas .requests .text import RawFile , Instrument , Question
34+
# Matches a string that consists solely of a CAPI variable code: 2-8 uppercase
# letters optionally followed by digits (e.g. AHATEA, BHOWREL). The code is
# captured as group 1.
re_capi_code = re.compile(r'^([A-Z]{2,8}[0-9]*)$')
36+
37+
# Matches a line that starts with a CAPI-style code (e.g. AHATEA, BHOWREL):
# an uppercase letter, then at least two uppercase letters/digits, then
# whitespace.
_CAPI_LINE_START_RE = re.compile(r'^[A-Z][A-Z0-9][A-Z0-9]+\s')

# Texts shorter than this are never classified as CAPI.
_CAPI_MIN_TEXT_LENGTH = 100
# Minimum absolute number of code-led lines required.
_CAPI_MIN_CODE_LINES = 10
# Minimum fraction of all lines that must be code-led.
_CAPI_MIN_CODE_RATIO = 0.02


def is_capi_format(text: str) -> bool:
    """Detect whether text is in CAPI format from the density of variable codes.

    A line counts as "code-led" when, after stripping surrounding whitespace,
    it starts with an uppercase code followed by whitespace.

    Args:
        text: The full text content to inspect.

    Returns:
        True if the text is at least 100 characters long, contains at least
        10 code-led lines, and those lines make up at least 2% of all lines;
        False otherwise (including for empty or missing text).
    """
    if not text or len(text) < _CAPI_MIN_TEXT_LENGTH:
        return False

    # Split on the bare newline. Splitting on "\n " (newline + space) would
    # leave ordinary text as a single giant "line" and never detect anything.
    lines = text.split("\n")

    # NOTE(review): each line is stripped before matching and the pattern
    # requires trailing whitespace after the code, so a code that sits alone
    # on its line is not counted here - confirm this is intended.
    capi_line_count = sum(
        1 for line in lines if _CAPI_LINE_START_RE.match(line.strip())
    )

    # str.split always returns at least one element, so this cannot divide
    # by zero.
    ratio = capi_line_count / len(lines)
    return capi_line_count >= _CAPI_MIN_CODE_LINES and ratio >= _CAPI_MIN_CODE_RATIO
56+
57+
# Strings that match the CAPI code pattern but are common non-question codes.
_CAPI_SKIP_CODES = frozenset({
    'CARD', 'NOTE', 'READ', 'CODE', 'TEXT', 'ENDIF', 'ELSE', 'AND', 'THE', 'FOR',
})

# Leading routing indicators: one or more "|" bars, optionally separated by
# whitespace (nested routing appears as "| | text").
_CAPI_ROUTING_PREFIX_RE = re.compile(r'^(?:\|\s*)+')

# Answer options start with a number followed by whitespace (e.g. "1 Yes").
_CAPI_ANSWER_OPTION_RE = re.compile(r'^\d+\s')

# Number of lines after a variable code that are scanned for question text.
_CAPI_LOOKAHEAD_LINES = 9


def _collect_capi_question_text(lines: List[str], code_idx: int) -> str:
    """Gather the question text that follows the code at ``lines[code_idx]``.

    Scans up to ``_CAPI_LOOKAHEAD_LINES`` following lines and returns the
    collected text with all whitespace runs collapsed to single spaces.
    """
    parts = []
    window = lines[code_idx + 1:code_idx + 1 + _CAPI_LOOKAHEAD_LINES]

    for raw_line in window:
        # Remove routing indicators, including nested ones ("| | text").
        candidate = _CAPI_ROUTING_PREFIX_RE.sub('', raw_line.strip())

        # Stop if we hit another CAPI code.
        if re_capi_code.match(candidate):
            break

        # Stop at an empty line once some text has been collected.
        if not candidate and parts:
            break

        # Skip interviewer instructions (all caps, longer than 10 characters).
        if candidate.isupper() and len(candidate) > 10:
            continue

        # Stop at answer options (lines starting with a number).
        if _CAPI_ANSWER_OPTION_RE.match(candidate):
            break

        # Skip header/footer lines.
        if 'Module' in candidate and any(char.isdigit() for char in candidate):
            continue
        if 'Millennium Cohort' in candidate:
            continue

        if candidate:
            parts.append(candidate)

    return re.sub(r'\s+', ' ', " ".join(parts)).strip()


def extract_capi_questions(text: str) -> List[dict]:
    """Extract questions from CAPI formatted text.

    A question starts at a line that consists solely of a variable code; its
    text is taken from the lines that follow. Routing bars ("|") at the start
    of those following lines are removed.

    Args:
        text: The full text content of the CAPI document.

    Returns:
        A list of dicts, one per extracted question, with keys
        ``question_no`` (the variable code), ``question_text`` and
        ``line_idx`` (index of the code line within the text split on
        newlines). The list is empty when nothing qualifies.
    """
    # Split on the bare newline; splitting on "\n " (newline + space) would
    # leave most documents as a single line and extract nothing.
    lines = text.split("\n")
    questions = []

    for idx, line in enumerate(lines):
        line_stripped = line.strip()

        # Skip empty lines and table of contents entries.
        if not line_stripped or '...' in line_stripped or '___' in line_stripped:
            continue

        # NOTE(review): routing bars are not removed from the code line
        # itself, so a code on a routed line ("| AHATEA") ends the previous
        # question but is not extracted - confirm whether that is intended.
        match = re_capi_code.match(line_stripped)
        if not match:
            continue

        code = match.group(1)

        # Skip common non-question codes and very short codes.
        if code in _CAPI_SKIP_CODES or len(code) < 3:
            continue

        question_text = _collect_capi_question_text(lines, idx)

        # Skip feed-forward metadata.
        if '(from feed forward)' in question_text.lower():
            continue

        # Only keep meaningful question text: more than 15 characters and
        # either containing a question mark or longer than 30 characters.
        if len(question_text) > 15 and ('?' in question_text or len(question_text) > 30):
            questions.append({
                'question_no': code,
                'question_text': question_text,
                'line_idx': idx,
            })

    return questions
128+
129+
def convert_capi_to_instruments(file: RawFile, text_content: str) -> List[Instrument]:
    """Build Harmony Instruments from the text of a CAPI formatted file.

    Args:
        file: The uploaded file; its id, name and type are copied onto the
            resulting instrument.
        text_content: The text content from which questions are extracted.

    Returns:
        An empty list when no questions are extracted; otherwise a list with
        a single Instrument holding every extracted question. The language is
        detected from the combined question texts and stays "en" if
        detection raises LangDetectException.
    """
    parsed = extract_capi_questions(text_content)
    if not parsed:
        return []

    # One Question per extracted entry; no intro, no options, page fixed at 0.
    questions = [
        Question(
            question_no=entry['question_no'],
            question_intro="",
            question_text=entry['question_text'],
            options=[],
            source_page=0
        )
        for entry in parsed
    ]

    # Default to English; override only when detection succeeds.
    language = "en"
    try:
        usable_texts = []
        for entry in parsed:
            candidate = entry['question_text']
            if isinstance(candidate, str) and candidate.strip():
                usable_texts.append(candidate)
        if usable_texts:
            language = detect(" ".join(usable_texts))
    except LangDetectException:
        print("Error identifying language in CAPI file")
        traceback.print_exc()

    return [
        Instrument(
            file_id=file.file_id,
            instrument_id=file.file_id + "_0",
            file_name=file.file_name,
            instrument_name=file.file_name,
            file_type=file.file_type,
            file_section="CAPI",
            language=language,
            questions=questions
        )
    ]
0 commit comments