Skip to content

Commit 800884e

Browse files
MaoSong2022MaoSong2022
authored and committed
feat(order_annotation.py): add labels, refs for caption and float environments
1 parent 5657aae commit 800884e

1 file changed

Lines changed: 71 additions & 52 deletions

File tree

vrdu/order_annotation.py

Lines changed: 71 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from vrdu.block import Block
66
from vrdu.config import config
77
from vrdu import utils
8+
from vrdu import logger
9+
10+
log = logger.get_logger(__name__)
811

912

1013
class OrderAnnotation:
@@ -128,76 +131,92 @@ def generate_cross_reference_order(self):
128131

129132
def generate_float_envs_order(self):
130133
# annotations = []
131-
pattern = r"\\label\{(.*?)\}"
134+
label_pattern = r"\\label\{(.*?)\}"
132135
# 0, add labels for titles
136+
# TODO: add labels for other types of titles
133137
for block in self.annotations["annotations"]:
134138
if config.category2name[block.category] != "Title":
135139
continue
136-
block.labels = re.findall(pattern, block.source_code)
140+
block.labels = re.findall(label_pattern, block.source_code)
137141

138142
# 1. add labels for equations
139143
for block in self.annotations["annotations"]:
140144
if config.category2name[block.category] != "Equation":
141145
continue
142-
block.labels = re.findall(pattern, block.source_code)
146+
block.labels = re.findall(label_pattern, block.source_code)
143147

144-
# 2. match caption to tabulars and generate labels
148+
# 2. add labels for float envs
149+
# colored_tex_file = self.tex_file.replace("paper_original", "paper_colored")
145150
with open(self.tex_file, "r") as f:
146151
latex_content = f.read()
147152
# find the interval of tables
148-
table_pattern = re.compile(
149-
r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL
150-
)
151-
table_indices = []
152-
for _match in table_pattern.finditer(latex_content):
153-
table_indices.append((_match.start(), _match.end(), str(uuid4())))
153+
category_to_patterns = {
154+
"Table": re.compile(
155+
r"\\begin\{table\*?\}(.*?)\\end\{table\*?\}", re.DOTALL
156+
),
157+
"Figure": re.compile(
158+
r"\\begin\{figure\*?\}(.*?)\\end\{figure\*?\}", re.DOTALL
159+
),
160+
"Algorithm": re.compile(
161+
r"\\begin\{algorithm\*?\}(.*?)\\end\{algorithm\*?\}", re.DOTALL
162+
),
163+
}
154164

155-
# find the interval of tabulars
156-
for block in self.annotations["annotations"]:
157-
if config.category2name[block.category] != "Table":
158-
continue
159-
start_index = latex_content.find(block.source_code)
160-
if start_index == -1:
161-
continue
162-
end_index = start_index + len(block.source_code)
165+
category_to_indicdes = {}
166+
for category, pattern in category_to_patterns.items():
167+
category_to_indicdes[category] = []
168+
indices = pattern.finditer(latex_content)
169+
# we add a uuid to match for float environments in case
170+
# there is no explicit citation
171+
for _match in indices:
172+
category_to_indicdes[category].append(
173+
(_match.start(), _match.end(), str(uuid4()))
174+
)
175+
log.debug(
176+
f"find {len(category_to_indicdes[category])} {category} in {self.tex_file}"
177+
)
178+
for category_name, indices in category_to_indicdes.items():
179+
# find labels for those float environments
180+
for block in self.annotations["annotations"]:
181+
if config.category2name[block.category] != category_name:
182+
continue
183+
184+
log.debug(f"processing {block.source_code}")
185+
start_index = latex_content.find(block.source_code)
186+
if start_index == -1:
187+
continue
188+
end_index = start_index + len(block.source_code)
189+
190+
for index in indices:
191+
if start_index < index[0] or end_index > index[1]:
192+
continue
163193

164-
for table_index in table_indices:
165-
if start_index >= table_index[0] and end_index <= table_index[1]:
194+
log.debug(f"wrapper: {latex_content[index[0] : index[1]]}")
166195
labels = re.findall(
167-
pattern, latex_content[table_index[0] : table_index[1]]
196+
label_pattern, latex_content[index[0] : index[1]]
168197
)
169198
block.labels = labels
170-
if not block.labels:
171-
block.labels = [table_index[2]]
172-
# find the interval of captions
173-
for block in self.annotations["annotations"]:
174-
if config.category2name[block.category] != "Caption":
175-
continue
176-
start_index = latex_content.find(block.source_code)
177-
if start_index == -1:
178-
continue
179-
end_index = start_index + len(block.source_code)
180-
for table_index in table_indices:
181-
if start_index >= table_index[0] and end_index <= table_index[1]:
182-
labels = re.findall(
183-
pattern, latex_content[table_index[0] : table_index[1]]
184-
)
185-
block.references = labels
186-
if not block.references:
187-
block.references = [table_index[2]]
188-
# match caption to tables and generate labels
189-
190-
# 3. match caption to figure and generate labels
191-
# TODO: complete this
192-
# 4. match caption to algorithms and generate labels
193-
194-
# 5. match caption to codings and generate labels
195-
196-
# caption to env
197-
# label to env
198-
# 1. caption-env attach, implicit cite and add label
199-
# 2. equation-label attach, add label
200-
# self.annotations["orders"].extend(annotations)
199+
block.labels.append(index[2])
200+
201+
log.debug(f"labels: {block.labels}")
202+
203+
# add references for captions to those float environments
204+
for block in self.annotations["annotations"]:
205+
if config.category2name[block.category] != "Caption":
206+
continue
207+
log.debug(f"processing {block.source_code}")
208+
start_index = latex_content.find(block.source_code)
209+
if start_index == -1:
210+
continue
211+
end_index = start_index + len(block.source_code)
212+
for index in indices:
213+
if start_index < index[0] or end_index > index[1]:
214+
continue
215+
216+
log.debug(f"wrapper: {latex_content[index[0] : index[1]]}")
217+
block.references = [index[2]]
218+
219+
log.debug(f"references: {block.references}")
201220

202221
def generate_sortable_envs_order(self):
203222
annotations = []

0 commit comments

Comments
 (0)