-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_worked_examples.py
More file actions
36 lines (27 loc) · 1.24 KB
/
Copy pathextract_worked_examples.py
File metadata and controls
36 lines (27 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
'''
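This script extracts the titles of the worked examples from the cnxmlplus book files.
Each title is added to a dictionary with the number of that worked example within its file,
and the dictionary is then written to a text file.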
'''
import os
from lxml import etree
# Directory holding the book's source files.
path = '/home/heather/Desktop/books/physical-sciences-11/afrikaans'

# Maps each worked-example title to its 1-based position within its own file.
# The dictionary is shared by every file, so a title that occurs more than
# once keeps only the number from the last occurrence that was processed.
wex_dictionary = {}

for file_name in os.listdir(path):
    full_file_name = os.path.join(path, file_name)

    # Skip directories
    if os.path.isdir(full_file_name):
        continue

    # The directory does not only contain book XML, so ignore every file that
    # is not a cnxmlplus file.
    if not file_name.endswith('cnxmlplus'):
        continue

    # Read the raw bytes and close the handle as soon as the content is in
    # memory. Passing bytes lets the parser honour the encoding declared in
    # the document; lxml rejects already-decoded text that still carries an
    # XML encoding declaration.
    with open(full_file_name, 'rb') as xml_file:
        xml = etree.XML(xml_file.read())

    # Numbering restarts at 1 for every file.
    for wex_counter, wex in enumerate(xml.findall('.//worked_example'), 1):
        title = wex.find('.//title')
        # A worked example with no title element, or with a title that has no
        # leading text, gives no usable key. Skip it, but let it keep its
        # number so the following examples are still numbered by position.
        if title is None or title.text is None:
            continue
        wex_dictionary[title.text] = wex_counter

# Write the contents of the dictionary (its Python repr) to the output file.
# NOTE(review): the output name says "gr12" while the input path points at
# physical-sciences-11 -- confirm the file name is intended.
with open('gr12-science-wexes.txt', 'w') as out_file:
    out_file.write(str(wex_dictionary))