Skip to content

Commit 087b747

Browse files
authored
Merge pull request #92 from metanorma/features/list-start
allow lists with arbitrary starting number: https://github.com/metano…
2 parents 3fa03fe + b20a881 commit 087b747

6 files changed

Lines changed: 476 additions & 22 deletions

File tree

lib/html2doc/base.rb

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def initialize(hash)
1313
@imagedir = hash[:imagedir]
1414
@debug = hash[:debug]
1515
@liststyles = hash[:liststyles]
16-
@stylesheet = hash[:stylesheet]
16+
@stylesheet = read_stylesheet(hash[:stylesheet])
1717
@c = HTMLEntities.new
1818
end
1919

@@ -74,8 +74,7 @@ def cleanup(docxml)
7474
end
7575

7676
def locate_landscape(_docxml)
77-
css = read_stylesheet(@stylesheet)
78-
@landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
77+
@landscape = @stylesheet.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
7978
.map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
8079
end
8180

@@ -99,11 +98,9 @@ def filename_substitute(head, header_filename)
9998
end
10099
end
101100

102-
def stylesheet(_filename, _header_filename, cssname)
103-
stylesheet = read_stylesheet(cssname)
101+
def stylesheet(_filename, _header_filename, _cssname)
102+
stylesheet = "#{@stylesheet}\n#{@newliststyledefs}"
104103
xml = Nokogiri::XML("<style/>")
105-
# s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
106-
# xml.children.first << Nokogiri::XML::Comment.new(xml, s)
107104
xml.children.first << Nokogiri::XML::CDATA
108105
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
109106
xml.root.to_s

lib/html2doc/lists.rb

Lines changed: 101 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55
class Html2Doc
66
def style_list(elem, level, liststyle, listnumber)
7-
return unless liststyle
8-
7+
liststyle or return
98
if elem["style"]
109
elem["style"] += ";"
1110
else
@@ -30,16 +29,37 @@ def list_add1(elem, liststyles, listtype, level)
3029

3130
def list_add(xpath, liststyles, listtype, level)
3231
xpath.each do |l|
33-
level == 1 and l["seen"] = true and @listnumber += 1
32+
level == 1 && l["seen"] = true and @listnumber += 1
3433
l["id"] ||= UUIDTools::UUID.random_create
34+
liststyle = derive_liststyle(l, liststyles[listtype], level)
3535
(l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
36-
style_list(li, level, liststyles[listtype], @listnumber)
36+
style_list(li, level, liststyle, @listnumber)
3737
list_add1(li, liststyles, listtype, level)
3838
end
3939
list_add_tail(l, liststyles, listtype, level)
4040
end
4141
end
4242

43+
# Pick the list style name to apply to +list+.
#
# A list without a "start" attribute, or with start="1", keeps the base
# +liststyle+. Otherwise a new style name "l<N>" is allocated (N is the next
# value of @liststyledefsidx), the definition produced by #newliststyle for
# that name is appended to @newliststyledefs, and the new name is returned.
#
# If the base style is nil or has no entry in @liststyledefs there is nothing
# to derive from, so the base style is returned unchanged instead of failing
# inside #newliststyle.
#
# list      - element-like object responding to [] (reads list["start"])
# liststyle - base list style name, or nil
# level     - nesting level that should receive the custom start value
#
# Returns the style name to use for this list (may be nil if liststyle is nil).
def derive_liststyle(list, liststyle, level)
  start = list["start"]
  return liststyle if start.nil? || start == "1"
  # No stored @list definition to copy: keep the base style.
  return liststyle unless @liststyledefs&.key?(liststyle)

  @liststyledefsidx += 1
  derived = "l#{@liststyledefsidx}"
  @newliststyledefs += newliststyle(start, liststyle, derived, level)
  derived
end
50+
51+
# Build the CSS for a new list style that is a copy of +liststyle+ but starts
# numbering at +start+ on the given +level+.
#
# The stored definition @liststyledefs[liststyle] is copied with every
# "@list <liststyle>" renamed to "@list <newstylename>"; its header block is
# replaced by one carrying freshly generated random mso-list-id and
# mso-list-template-ids values; and "mso-level-start-at:<start>;" is inserted
# at the top of the ":level<level>" block.
#
# Style names are escaped before being interpolated into regular expressions,
# and replacements use the block form so that no character in the names or in
# +start+ is interpreted as a regexp metacharacter or a back-reference.
#
# start        - value of the list's "start" attribute (String)
# liststyle    - name of the existing style; must be a key of @liststyledefs
# newstylename - name for the derived style
# level        - nesting level whose block receives the start value
#
# Returns the derived definition as a String terminated by a newline.
def newliststyle(start, liststyle, newstylename, level)
  old_name = Regexp.escape(liststyle)
  new_name = Regexp.escape(newstylename)
  header = <<~LISTSTYLE
    @list #{newstylename}\n{mso-list-id:#{rand(100_000_000..999_999_999)};
    mso-list-template-ids:#{rand(100_000_000..999_999_999)};}
  LISTSTYLE
  s = @liststyledefs[liststyle]
    .gsub(/@list\s+#{old_name}/) { "@list #{newstylename}" }
    .sub(/@list\s+#{new_name}\s+\{[^}]*\}/m) { header }
    .sub(/@list\s+#{new_name}:level#{level}\s+\{/m) do |opening|
      "#{opening}mso-level-start-at:#{start};\n"
    end
  "#{s}\n"
end
62+
4363
def list_add_tail(list, liststyles, listtype, level)
4464
list.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{list['id']}')] | "\
4565
".//ol[not(ancestor::li/ancestor::*/@id = '#{list['id']}')]")
@@ -49,16 +69,15 @@ def list_add_tail(list, liststyles, listtype, level)
4969
end
5070

5171
def list2para(list)
52-
return if list.xpath("./li").empty?
53-
72+
list.xpath("./li").empty? and return
5473
list2para_position(list)
5574
list.xpath("./li").each do |l|
5675
l.name = "p"
5776
l["class"] ||= "MsoListParagraphCxSpMiddle"
58-
next unless l.first_element_child&.name == "p"
59-
77+
l.first_element_child&.name == "p" or next
6078
l["style"] ||= ""
61-
l["style"] += (l.first_element_child["style"]&.sub(/mso-list[^;]+;/, "") || "")
79+
l["style"] += l.first_element_child["style"]
80+
&.sub(/mso-list[^;]+;/, "") || ""
6281
l.first_element_child.replace(l.first_element_child.children)
6382
end
6483
list.replace(list.children)
@@ -100,12 +119,82 @@ def lists_unstyled(docxml, liststyles)
100119
end
101120

102121
def lists(docxml, liststyles)
103-
return if liststyles.nil?
104-
105-
@listnumber = 0
122+
liststyles.nil? and return
123+
parse_stylesheet_line_styles
106124
liststyles.each_key { |k| lists1(docxml, liststyles, k) }
107125
lists_unstyled(docxml, liststyles)
108126
liststyles.has_key?(:ul) and docxml.xpath("//ul").each { |u| list2para(u) }
109127
liststyles.has_key?(:ol) and docxml.xpath("//ol").each { |u| list2para(u) }
110128
end
129+
130+
# Reset the per-document list state and index the @list definitions found in
# @stylesheet.
#
# Sets:
#   @listnumber       - 0
#   @liststyledefs    - Hash of list style name => definition text, as built
#                       by #process_stylesheet_lines and #clean_result_content
#   @newliststyledefs - "" (accumulator for derived style definitions)
#   @liststyledefsidx - highest trailing number among the style names, or 0
#                       when there are none; new derived styles are numbered
#                       upwards from here
def parse_stylesheet_line_styles
  @listnumber = 0
  result = process_stylesheet_lines(@stylesheet.split("\n"))
  @liststyledefs = clean_result_content(result)
  @newliststyledefs = ""
  # Take the whole trailing run of digits ("l10" => 10). A greedy ".*" prefix
  # would leave only the last digit and make derived names collide with
  # existing ones. Names without a trailing number count as 0.
  @liststyledefsidx = @liststyledefs.keys.map { |k| k[/\d+\z/].to_i }.max || 0
end
139+
140+
private
141+
142+
# Return the base list name declared on a stylesheet line, e.g. "l1" for both
# "@list l1" and "@list l1:level2". Returns nil when the line is not an
# "@list" declaration.
def extract_list_name(line)
  line[/^\s*@list\s+([^:\s]+)(?::.*)?/, 1]
end
146+
147+
# True when the stylesheet line opens an "@list" declaration, i.e. when
# #extract_list_name finds a list name on it; false otherwise.
def list_declaration?(line)
  extract_list_name(line) ? true : false
end
150+
151+
# Store the text accumulated for one list style in +result+.
#
# Nothing is stored when there is no current list name or the accumulated
# text is empty. If +result+ already has text under +current_base+, the new
# text is appended to it; otherwise it becomes the entry's value.
#
# Returns +result+ in every case.
def save_current_list(result, current_base, current_content)
  return result if current_base.nil? || current_content.empty?

  existing = result[current_base]
  result[current_base] = existing ? existing + current_content : current_content
  result
end
160+
161+
# Collect the "@list" definitions of a stylesheet, grouped by base list name.
#
# Lines are scanned in order. A line recognised by #extract_list_name starts
# (or, if it names the same list as the previous declaration, continues) the
# text collected for that list. The lines that follow are added until the
# first one containing "}", which closes the block. Lines outside any @list
# block are ignored.
#
# A declaration whose block is closed on the same line ("@list l1 {...}")
# ends collection immediately, so that the next, unrelated rule is not
# absorbed into the list definition.
#
# lines - Array of stylesheet lines without trailing newlines
#
# Returns a Hash of list name => collected text; every collected line is
# terminated by "\n".
def process_stylesheet_lines(lines)
  result = {}
  current_base = nil
  current_content = ""
  parsing_active = false

  lines.each do |line|
    base_name = extract_list_name(line)
    if base_name
      unless current_base == base_name
        # A different list begins: flush what was gathered for the previous one.
        save_current_list(result, current_base, current_content)
        current_base = base_name
        current_content = ""
      end
      current_content += "#{line}\n"
      # Keep collecting only if the block is still open after this line.
      parsing_active = !line.include?("}")
    elsif parsing_active
      current_content += "#{line}\n"
      parsing_active = false if line.include?("}")
    end
  end
  # Flush the list that was being collected when the input ended.
  save_current_list(result, current_base, current_content)
  result
end
195+
196+
# Strip trailing whitespace from every collected definition, updating
# +result+ in place. Returns the same Hash.
def clean_result_content(result)
  result.transform_values!(&:rstrip)
end
111200
end

lib/html2doc/mime.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,7 @@ def skip_image_cleanup?(img)
135135
# Scan both @stylesheet and docxml.to_xml (where @standardstylesheet has ended up)
136136
# Allow 0.9 * height to fit caption
137137
def page_dimensions(docxml)
138-
stylesheet = read_stylesheet(@stylesheet)
139-
page_size = find_page_size_in_doc(stylesheet, docxml.to_xml) or
138+
page_size = find_page_size_in_doc(@stylesheet, docxml.to_xml) or
140139
return [680, 400]
141140
m_size = /size:\s*(\S+)\s+(\S+)\s*;/.match(page_size) or return [680, 400]
142141
m_marg = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(page_size) or

lib/html2doc/version.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
class Html2Doc
2-
VERSION = "1.9.2".freeze
2+
VERSION = "1.10.0".freeze
33
end

spec/html2doc_spec.rb

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ def word_body(xml, footnote)
263263
DEFAULT_STYLESHEET = File.read("lib/html2doc/wordstyle.css",
264264
encoding: "utf-8").freeze
265265

266+
NEW_LIST_STYLES = File.read("spec/wordstyle-custom-lists.css",
267+
encoding: "utf-8")
268+
266269
def guid_clean(xml)
267270
xml.gsub(/NextPart_[0-9a-f.]+/, "NextPart_")
268271
end
@@ -965,6 +968,32 @@ def image_clean(xml)
965968
OUTPUT
966969
end
967970

971+
it "sets custom start numbering of lists" do
972+
simple_body = <<~BODY
973+
<div>
974+
<ol id="1"><li><div><p><ol id="2" start="3"><li><ul id="3"><li><p><ol id="4" start="5"><li><ol id="5" start="7"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol>
975+
<ol id="6" start="2"><li><div><p><ol id="7" start="1"><li><ul id="8"><li><p><ol id="9"><li><ol id="10"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol></div>
976+
BODY
977+
Html2Doc.new(filename: "test", liststyles: { ul: "l0", ol: "l1" })
978+
.process(html_input(simple_body))
979+
style = "#{DEFAULT_STYLESHEET} #{NEW_LIST_STYLES}"
980+
.gsub(/mso-list-id:\d+/, "mso-list-id:_")
981+
.gsub(/mso-list-template-ids:\d+/, "mso-list-template-ids:_")
982+
doc = File.read("test.doc", encoding: "utf-8")
983+
.gsub(/mso-list-id:\d+/, "mso-list-id:_")
984+
.gsub(/mso-list-template-ids:\d+/, "mso-list-template-ids:_")
985+
986+
expect(guid_clean(doc))
987+
.to match_fuzzy(<<~OUTPUT)
988+
#{WORD_HDR} #{style} #{WORD_HDR_END}
989+
#{word_body('<div>
990+
<p style="mso-list:l1 level1 lfo1;" class="MsoListParagraphCxSpFirst"><div><p class="MsoNormal"><p style="mso-list:l2 level2 lfo1;" class="MsoListParagraphCxSpFirst"><p style="mso-list:l3 level4 lfo1;" class="MsoListParagraphCxSpFirst"><p style="mso-list:l4 level5 lfo1;" class="MsoListParagraphCxSpFirst">A</p></p></p></p></div></p>
991+
<p style="mso-list:l5 level1 lfo2;" class="MsoListParagraphCxSpFirst"><div><p class="MsoNormal"><p style="mso-list:l1 level2 lfo2;" class="MsoListParagraphCxSpFirst"><p style="mso-list:l1 level4 lfo2;" class="MsoListParagraphCxSpFirst"><p style="mso-list:l1 level5 lfo2;" class="MsoListParagraphCxSpFirst">A</p></p></p></p></div></p></div>',
992+
'<div style="mso-element:footnote-list"/>')}
993+
#{WORD_FTR1}
994+
OUTPUT
995+
end
996+
968997
it "labels lists with multiple list styles" do
969998
simple_body = <<~BODY
970999
<div><ul class="steps" id="0">

0 commit comments

Comments
 (0)