|
5 | 5 | from vrdu.block import Block |
6 | 6 | from vrdu.config import config |
7 | 7 | from vrdu import utils |
| 8 | +from vrdu import logger |
| 9 | + |
| 10 | +log = logger.get_logger(__name__) |
8 | 11 |
|
9 | 12 |
|
10 | 13 | class OrderAnnotation: |
@@ -128,76 +131,92 @@ def generate_cross_reference_order(self): |
128 | 131 |
|
129 | 132 | def generate_float_envs_order(self): |
130 | 133 | # annotations = [] |
131 | | - pattern = r"\\label\{(.*?)\}" |
| 134 | + label_pattern = r"\\label\{(.*?)\}" |
132 | 135 | # 0, add labels for titles |
| 136 | + # TODO: add labels for other types of titles |
133 | 137 | for block in self.annotations["annotations"]: |
134 | 138 | if config.category2name[block.category] != "Title": |
135 | 139 | continue |
136 | | - block.labels = re.findall(pattern, block.source_code) |
| 140 | + block.labels = re.findall(label_pattern, block.source_code) |
137 | 141 |
|
138 | 142 | # 1. add labels for equations |
139 | 143 | for block in self.annotations["annotations"]: |
140 | 144 | if config.category2name[block.category] != "Equation": |
141 | 145 | continue |
142 | | - block.labels = re.findall(pattern, block.source_code) |
| 146 | + block.labels = re.findall(label_pattern, block.source_code) |
143 | 147 |
|
144 | | - # 2. match caption to tabulars and generate labels |
| 148 | + # 2. add labels for float envs |
| 149 | + # colored_tex_file = self.tex_file.replace("paper_original", "paper_colored") |
145 | 150 | with open(self.tex_file, "r") as f: |
146 | 151 | latex_content = f.read() |
147 | 152 | # find the interval of tables |
148 | | - table_pattern = re.compile( |
149 | | - r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL |
150 | | - ) |
151 | | - table_indices = [] |
152 | | - for _match in table_pattern.finditer(latex_content): |
153 | | - table_indices.append((_match.start(), _match.end(), str(uuid4()))) |
| 153 | + category_to_patterns = { |
| 154 | + "Table": re.compile( |
| 155 | + r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL |
| 156 | + ), |
| 157 | + "Figure": re.compile( |
| 158 | + r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", re.DOTALL |
| 159 | + ), |
| 160 | + "Algorithm": re.compile( |
| 161 | + r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", re.DOTALL |
| 162 | + ), |
| 163 | + } |
154 | 164 |
|
155 | | - # find the interval of tabulars |
156 | | - for block in self.annotations["annotations"]: |
157 | | - if config.category2name[block.category] != "Table": |
158 | | - continue |
159 | | - start_index = latex_content.find(block.source_code) |
160 | | - if start_index == -1: |
161 | | - continue |
162 | | - end_index = start_index + len(block.source_code) |
| 165 | + category_to_indicdes = {} |
| 166 | + for category, pattern in category_to_patterns.items(): |
| 167 | + category_to_indicdes[category] = [] |
| 168 | + indices = pattern.finditer(latex_content) |
| 169 | + # we attach a uuid to each float environment so that it can still be |
| 170 | + # matched in case there is no explicit citation |
| 171 | + for _match in indices: |
| 172 | + category_to_indicdes[category].append( |
| 173 | + (_match.start(), _match.end(), str(uuid4())) |
| 174 | + ) |
| 175 | + log.debug( |
| 176 | + f"find {len(category_to_indicdes[category])} {category} in {self.tex_file}" |
| 177 | + ) |
| 178 | + for category_name, indices in category_to_indicdes.items(): |
| 179 | + # find labels for those float environments |
| 180 | + for block in self.annotations["annotations"]: |
| 181 | + if config.category2name[block.category] != category_name: |
| 182 | + continue |
| 183 | + |
| 184 | + log.debug(f"processing {block.source_code}") |
| 185 | + start_index = latex_content.find(block.source_code) |
| 186 | + if start_index == -1: |
| 187 | + continue |
| 188 | + end_index = start_index + len(block.source_code) |
| 189 | + |
| 190 | + for index in indices: |
| 191 | + if start_index < index[0] or end_index > index[1]: |
| 192 | + continue |
163 | 193 |
|
164 | | - for table_index in table_indices: |
165 | | - if start_index >= table_index[0] and end_index <= table_index[1]: |
| 194 | + log.debug(f"wrapper: {latex_content[index[0] : index[1]]}") |
166 | 195 | labels = re.findall( |
167 | | - pattern, latex_content[table_index[0] : table_index[1]] |
| 196 | + label_pattern, latex_content[index[0] : index[1]] |
168 | 197 | ) |
169 | 198 | block.labels = labels |
170 | | - if not block.labels: |
171 | | - block.labels = [table_index[2]] |
172 | | - # find the interval of captions |
173 | | - for block in self.annotations["annotations"]: |
174 | | - if config.category2name[block.category] != "Caption": |
175 | | - continue |
176 | | - start_index = latex_content.find(block.source_code) |
177 | | - if start_index == -1: |
178 | | - continue |
179 | | - end_index = start_index + len(block.source_code) |
180 | | - for table_index in table_indices: |
181 | | - if start_index >= table_index[0] and end_index <= table_index[1]: |
182 | | - labels = re.findall( |
183 | | - pattern, latex_content[table_index[0] : table_index[1]] |
184 | | - ) |
185 | | - block.references = labels |
186 | | - if not block.references: |
187 | | - block.references = [table_index[2]] |
188 | | - # match caption to tables and generate labels |
189 | | - |
190 | | - # 3. match caption to figure and generate labels |
191 | | - # TODO: complete this |
192 | | - # 4. match caption to algorithms and generate labels |
193 | | - |
194 | | - # 5. match caption to codings and generate labels |
195 | | - |
196 | | - # caption to env |
197 | | - # label to env |
198 | | - # 1. caption-env attach, implicit cite and add label |
199 | | - # 2. equation-label attach, add label |
200 | | - # self.annotations["orders"].extend(annotations) |
| 199 | + block.labels.append(index[2]) |
| 200 | + |
| 201 | + log.debug(f"labels: {block.labels}") |
| 202 | + |
| 203 | + # add references for captions to those float environments |
| 204 | + for block in self.annotations["annotations"]: |
| 205 | + if config.category2name[block.category] != "Caption": |
| 206 | + continue |
| 207 | + log.debug(f"processing {block.source_code}") |
| 208 | + start_index = latex_content.find(block.source_code) |
| 209 | + if start_index == -1: |
| 210 | + continue |
| 211 | + end_index = start_index + len(block.source_code) |
| 212 | + for index in indices: |
| 213 | + if start_index < index[0] or end_index > index[1]: |
| 214 | + continue |
| 215 | + |
| 216 | + log.debug(f"wrapper: {latex_content[index[0] : index[1]]}") |
| 217 | + block.references = [index[2]] |
| 218 | + |
| 219 | + log.debug(f"references: {block.references}") |
201 | 220 |
|
202 | 221 | def generate_sortable_envs_order(self): |
203 | 222 | annotations = [] |
|
0 commit comments