scripts/number_html5.py at master · HeatherW/scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
'''Automatic numbering script
This script does the following:
1. Updates the anchor tags to have the correct figure or table reference. This is only needed for science at present.
2. Updates the figure and table caption tags to have a prefix of table x: or figure x:. Again only for science.
3. Updates chapter titles, section titles and subsection titles. Chapter titles need chapter x: as  a prefix, section titles need 'no. title shortcode' and subsection titles just need the shortcode at the end.
4. Numbers the worked examples.
5. Numbers the exercises
'''

# Import the necessary items
import os

from lxml import etree

'/home/heather/Desktop/books/mathematics-10/afrikaans/build/epubs/maths10/OPS/xhtml/maths10'
path = '/home/heather/Desktop/books/scripts/tests/sample-files-for-testing/unnumbered_files'
file_list = os.listdir(path)
file_list.sort()


class NumberingClass():
    '''
    The workhorse for numbering all the pieces of the html
    '''

    def __init__(self, file_list):
        self.file_list = file_list
        self.numbered_files = {}
        self.section_number = 1
        self.figure_number = 1
        self.table_number = 1
        self.worked_example_number = 1
        self.exercise_number = 1
        self.table_dictionary = {}
        self.figure_dictionary = {}

    def number_files(self, write_back_to_file_boolean=True):
        for file_name in self.file_list:
            # Skip directories
            full_file_name = '{}/{}'.format(path, file_name)
            if os.path.isdir(full_file_name):
                continue
            if file_name[0] not in ['0', '1', '2']:  # we need to ignore anything that does not start with a number
                continue

            self.numbered_files[file_name] = self.number_file(full_file_name)

        if not write_back_to_file_boolean:
            return

        for file_name in self.numbered_files.keys():
            if self.numbered_files[file_name]:
                full_file_name = '{}/{}'.format(path, file_name)
                file_text = self.number_file(full_file_name)
                self.write_back_to_file(file_text, full_file_name)

    def number_file(self, full_file_name):
        xml = etree.parse(full_file_name, etree.HTMLParser())

        heading1 = xml.find('.//h1')
        sections = xml.findall('.//section')
        divs = xml.findall('.//div')
        figures = xml.findall('.//figure[@id]')
        anchors = xml.findall('.//a')
        find_chapter_number_index = full_file_name.rfind('/')
        file_number = int(full_file_name
                          [find_chapter_number_index+1:find_chapter_number_index+3])

        def chapter_number_insert(self):
            # Create the chapter titles
            if heading1 is not None:
                newText = 'Chapter {}: {}'.format(file_number, heading1.text)
                heading1.text = newText
                return heading1


        file_counter = int(full_file_name[-17:-15])  # the end of every file name is .cnxmlplus.html which has length of 15, this extracts the number at the tail end of the file name
        # Handle section, subsection, exercise, worked example, table and figure numbering
        #if file_counter == 0:
            #section_number = 1
            #figure_number = 1
            #table_number = 1
            #worked_example_counter = 1
            #exercise_counter = 1
        #else:
            #for section in sections:
                #if section.find('h3') is not None:  # subsection headings
                    #h3 = section.find('h3')
                    #try:
                        #shortcode = etree.Element('span')
                        #shortcode.text = '({})'.format(section.attrib['id'][2:])
                        #shortcode.set('class', 'shortcode')
                        #h3.text = '{} '.format(h3.text)
                        #h3.append(shortcode)
                    #except KeyError:
                        #continue
                #if section.attrib['class'] == 'worked_example':  # worked examples
                    #title = section.find('h2')
                    #if title is not None:
                        #title.text = 'Worked example ' + str(worked_example_counter) + ':' + title.text  # while .format or other string concatenation methods might work better this handles unicode errors better
                        #worked_example_counter += 1
                #if section.attrib['class'] == 'exercises':  # exercises
                    #try:
                        #problem_set = section.find('.//div[@class]')
                        #if problem_set.attrib['class'] == 'problemset':
                            #span_code = etree.Element('span')
                            #span_code.set('class', 'exerciseTitle')
                            #span_code.text = 'Exercise {}.{}'.format(file_number, exercise_counter)
                            #problem_set.insert(0, span_code)
                            #exercise_counter += 1
                    #except AttributeError:
                        #continue
                #if section.find('h2') is not None:  # section headings
                    #h2 = section.find('h2')
                    #try:
                        #if section.attrib['id'] is not None and section.attrib['id'][:2] == 'sc':
                            #shortcode = etree.Element('span')
                            #shortcode.text = '({})'.format(section.attrib['id'][2:])
                            #shortcode.set('class', 'shortcode')
                            #h2.text = '{}.{} {}'.format(file_number, self.section_number, h2.text)
                            #h2.append(shortcode)
                            #self.section_number += 1
                    #except KeyError:
                        #continue


        # figure numbering
        def figure_number_insert(self):
            for figure in figures:
                caption = figure.find('.//figcaption')
                if caption is not None and figure.attrib['id'] is not None:
                    if caption.find('.//p') is not None:
                        para = caption.find('.//p')
                        para.text = 'Figure {}.{}: {}'.format(file_number, self.figure_number, para.text)
                    else:
                        caption.text = 'Figure {}.{}: {}'.format(file_number, self.figure_number, caption.text)
                    self.figure_dictionary[figure.attrib['id']] = str(file_number) + '.' + str(self.figure_number)
                    self.figure_number += 1
                    return caption


        # table numbering
        def table_number_insert(self):
            for div in divs:
                try:
                    if div.attrib['class'] is not None:
                        if div.attrib['id'] is not None and div.attrib['class'] == 'FigureTable':
                            caption = div.find('.//div[@caption]')
                            if caption is not None:
                                if caption.find('.//p') is not None:
                                    para = caption.find('.//p')
                                    para.text = 'Table {}.{}: {}'.format(file_number, table_number, para.text)
                                else:
                                    caption.text = 'Table {}.{}: {}'.format(file_number, table_number, caption.text)
                                table_dictionary[div.attrib['id']] = str(file_number) + '.' + str(table_number)
                                table_number += 1
                                return caption
                except KeyError:
                    continue


        # replace the anchor tags
        def hyperlink_text_fix(self):
            for anchor in anchors:
                try:
                    if anchor.attrib['class'] == 'InternalLink':
                        if anchor.attrib['href'][1:] in table_dictionary.keys():
                            anchor.text = 'Table ' + table_dictionary[anchor.attrib['href'][1:]]
                        elif anchor.attrib['href'][1:] in figure_dictionary.keys():
                            anchor.text = 'Figure ' + figure_dictionary[anchor.attrib['href'][1:]]
                        return anchor
                except KeyError:
                    continue

        if chapter_number_insert(self) is not None:
            xml = chapter_number_insert(self)
        if figure_number_insert(self) is not None:
            xml = figure_number_insert(self)
        if table_number_insert(self) is not None:
            xml = table_number_insert(self)
        if hyperlink_text_fix(self) is not None:
            xml = hyperlink_text_fix(self)

        file_text = None

        file_text = etree.tostring(xml, pretty_print=True)
        return file_text

    def write_back_to_file(self, file_text, full_file_name):
        # This overwrites the contents of the file you are working with.
        # Only do this if you are sure you can get the contents of the file back
        # before the script was run.
        with open(full_file_name, 'w') as file:
            file.write(file_text)