-
-
Notifications
You must be signed in to change notification settings - Fork 179
Expand file tree
/
Copy pathdocument.rb
More file actions
executable file
·201 lines (169 loc) · 5.43 KB
/
document.rb
File metadata and controls
executable file
·201 lines (169 loc) · 5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
require 'docx/containers'
require 'docx/elements'
require 'nokogiri'
require 'zip'
module Docx
  # The Document class wraps around a docx file and provides methods to
  # interface with it.
  #
  #   # get a Docx::Document for a docx file in the local directory
  #   doc = Docx::Document.open("test.docx")
  #
  #   # get the text from the document
  #   puts doc.text
  #
  #   # do the same thing in a block
  #   Docx::Document.open("test.docx") do |d|
  #     puts d.text
  #   end
  class Document
    # Entry name assumed for the main document part when none was recorded
    # by #initialize.
    DEFAULT_DOCUMENT_ENTRY = 'word/document.xml'.freeze

    # NOTE: @xml is exposed for compatibility but is never assigned in this file.
    attr_reader :xml, :doc, :zip, :styles

    # Opens the archive, parses the main document part and loads the styles
    # and relationship parts. Yields +self+ when a block is given.
    #
    # +path_or_io+ is treated as a file path when it is a String without a
    # null byte; anything else (an IO, or a String holding the raw archive
    # bytes) is handed to Zip::File.open_buffer.
    #
    # +options+ is accepted for compatibility and is not read here.
    #
    # Raises Errno::ENOENT when no <tt>word/document*.xml</tt> entry exists.
    # Errors raised while opening the archive propagate unchanged.
    def initialize(path_or_io, options = {})
      @replace = {}

      @zip = if file_path?(path_or_io)
               Zip::File.open(path_or_io)
             else
               Zip::File.open_buffer(path_or_io)
             end

      document = @zip.glob('word/document*.xml').first
      raise Errno::ENOENT if document.nil?

      # Remember the real entry name (it may be e.g. word/document2.xml) so
      # that #update replaces the entry that was actually read.
      @document_name = document.name
      @document_xml = document.get_input_stream.read
      @doc = Nokogiri::XML(@document_xml)
      load_styles
      yield(self) if block_given?
    ensure
      # @zip is still nil when opening the archive itself failed; calling
      # #close on nil here would mask the original exception.
      @zip&.close
    end

    # This stores the current global document properties, for now
    def document_properties
      {
        font_size: font_size,
        hyperlinks: hyperlinks
      }
    end

    # Docx::Document.open is a synonym for Docx::Document.new. If the optional
    # code block is given, it is passed the new Docx::Document as an argument.
    # The archive handle is closed by #initialize before this method returns,
    # with or without a block. The Docx::Document is returned in both forms;
    # the value of the block is not returned.
    # call-seq:
    #   open(filepath) => document
    #   open(filepath) {|document| block } => document
    def self.open(path, &block)
      new(path, &block)
    end

    # Top-level paragraphs of the document body, wrapped as
    # Elements::Containers::Paragraph objects.
    def paragraphs
      @doc.xpath('//w:document//w:body/w:p').map { |p_node| parse_paragraph_from p_node }
    end

    # Hash of bookmark name => Elements::Bookmark for every w:bookmarkStart
    # node. When a name occurs more than once the last occurrence wins.
    def bookmarks
      @doc.xpath('//w:bookmarkStart').each_with_object({}) do |b_node, found|
        bookmark = parse_bookmark_from(b_node)
        # auto-generated by office 2010
        next if bookmark.name == '_GoBack'

        found[bookmark.name] = bookmark
      end
    end

    # Every table in the document body, wrapped as Elements::Containers::Table.
    def tables
      @doc.xpath('//w:document//w:body//w:tbl').map { |t_node| parse_table_from t_node }
    end

    # Some documents have this set, others don't.
    # Values are stored as half-points, which is why the value is divided by 2
    # to get points. The division is an Integer division, so an odd half-point
    # value is rounded down (21 => 10).
    #
    # Returns nil when no styles were loaded or no default size is declared.
    def font_size
      return nil unless @styles

      size_tag = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
      return nil unless size_tag

      size_tag.attributes['val'].value.to_i / 2
    end

    # Hyperlink targets are extracted from the document's .rels file.
    # Returns a Hash of relationship Id => Target; empty when the
    # relationships part could not be loaded.
    def hyperlinks
      hyperlink_relationships.each_with_object({}) do |rel, hash|
        hash[rel.attributes['Id'].value] = rel.attributes['Target'].value
      end
    end

    # Relationship nodes whose Type contains 'hyperlink'. Returns an empty
    # Array when the relationships part could not be loaded, so documents
    # without one can still be read.
    def hyperlink_relationships
      return [] unless @rels

      @rels.xpath("//xmlns:Relationship[contains(@Type,'hyperlink')]")
    end

    ##
    # *Deprecated*
    #
    # Iterates over paragraphs within document
    # call-seq:
    #   each_paragraph {|paragraph| block } => array of paragraphs
    #   each_paragraph => Enumerator
    def each_paragraph(&block)
      return enum_for(:each_paragraph) unless block

      paragraphs.each(&block)
    end

    # call-seq:
    #   to_s -> string
    def to_s
      paragraphs.map(&:to_s).join("\n")
    end

    # Output entire document as a String HTML fragment
    def to_html
      paragraphs.map(&:to_html).join("\n")
    end

    # Save document to provided path
    # call-seq:
    #   save(filepath) => void
    def save(path)
      update
      Zip::OutputStream.open(path) { |out| write_entries(out) }
      # Kept from the original implementation; #initialize has already
      # closed the archive once.
      zip.close
    end

    # Output entire document as a StringIO object
    def stream
      update
      buffer = Zip::OutputStream.write_buffer { |out| write_entries(out) }
      buffer.rewind
      buffer
    end

    alias text to_s

    # Registers +file_contents+ to be written in place of the archive entry
    # named +entry_path+ by #save and #stream.
    def replace_entry(entry_path, file_contents)
      @replace[entry_path] = file_contents
    end

    private

    # True when +path_or_io+ should be opened as a path on disk: exactly a
    # String, and free of null bytes (a String containing a null byte is
    # taken to be the raw archive contents).
    def file_path?(path_or_io)
      path_or_io.instance_of?(String) && !path_or_io.include?("\0")
    end

    # Loads the styles part and the document's relationships part. Each part
    # is loaded independently, so a missing styles.xml does not prevent the
    # relationships from being loaded (and vice versa). A missing part only
    # produces a warning and leaves the matching instance variables nil.
    def load_styles
      @styles_xml, @styles = read_xml_entry('word/styles.xml')
      @rels_xml, @rels = read_xml_entry(rels_entry_name)
      nil
    end

    # Reads +entry_name+ from the archive and returns [raw_xml, parsed_xml].
    # Warns and returns nil when reading the entry raises Errno::ENOENT.
    def read_xml_entry(entry_name)
      raw = @zip.read(entry_name)
      [raw, Nokogiri::XML(raw)]
    rescue Errno::ENOENT => e
      warn e.message
      nil
    end

    # Name of the relationships part that belongs to the main document part:
    # "<dir>/_rels/<file>.rels", e.g. word/_rels/document.xml.rels.
    def rels_entry_name
      name = @document_name || DEFAULT_DOCUMENT_ENTRY
      File.join(File.dirname(name), '_rels', "#{File.basename(name)}.rels")
    end

    # Copies every file entry of the source archive into +out+, substituting
    # the contents registered through #replace_entry where present.
    def write_entries(out)
      zip.each do |entry|
        next unless entry.file?

        out.put_next_entry(entry.name)
        out.write(@replace[entry.name] || zip.read(entry.name))
      end
    end

    #--
    # TODO: Flesh this out to be compatible with other files
    # TODO: Method to set flag on files that have been edited, probably by inserting something at the
    # end of methods that make edits?
    #++
    # Registers the serialized document XML as the replacement for the main
    # document entry that was read in #initialize.
    def update
      replace_entry(@document_name || DEFAULT_DOCUMENT_ENTRY, doc.serialize(save_with: 0))
    end

    # generate Elements::Containers::Paragraph from paragraph XML node
    def parse_paragraph_from(p_node)
      Elements::Containers::Paragraph.new(p_node, document_properties)
    end

    # generate Elements::Bookmark from bookmark XML node
    def parse_bookmark_from(b_node)
      Elements::Bookmark.new(b_node)
    end

    # generate Elements::Containers::Table from table XML node
    def parse_table_from(t_node)
      Elements::Containers::Table.new(t_node)
    end
  end
end