-
Notifications
You must be signed in to change notification settings - Fork 2
/
docx_extract_bodytext.rb
executable file
·82 lines (54 loc) · 1.74 KB
/
docx_extract_bodytext.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env ruby -W0
###################################################
###
## File: docx_extract_bodytext.rb
## Desc: Displays the "BodyText" styled paragraphs of *.docx files
#
require 'amazing_print' # Pretty print Ruby objects with proper indentation and colors
require 'pathname' # STDLIB
require 'docx' # a ruby library/gem for interacting with .docx files
# Print the usage message and stop when the script is invoked with no
# arguments, or when the first argument asks for help (-h / --help).
#
# The script name is taken from $PROGRAM_NAME so the banner always matches
# the file actually being run, instead of a hard-coded name.
if ARGV.empty? || %w[-h --help].include?(ARGV.first)
  puts <<~USAGE

    Usage: #{File.basename($PROGRAM_NAME)} [options] MS_WORD_DOCX++

     Where:
     MS_WORD_DOCX++ is one or more Microsoft Word DOCX filenames

     options are:

     -h or --help This usage message is produced

  USAGE
  exit
end
# SMELL: This returns a string in which the spaces have been squeezed out
# TODO: Move to the lib/*.rb file
#
# Looks up the style name attached to a paragraph.
#
# Reads the first attribute of the first grandchild of the paragraph's
# underlying node and returns that attribute's value as a String.
# NOTE(review): presumably that grandchild is the paragraph's style
# element -- confirm against the docx gem's node layout.
#
# @param para [#node] a paragraph object that exposes its XML node via #node
# @return [String] the attribute value, or "Normal" when the lookup fails
#   with any StandardError (e.g. no children, no attributes)
def paragraph_style(para)
  style_attribute = para.node.children.children.first.attributes.first.last
  style_attribute.value.to_s
rescue StandardError
  # Only ordinary errors fall back to the default style. Interrupt,
  # SystemExit and other non-StandardError exceptions must still propagate
  # so the script can be stopped while it is running.
  'Normal'
end # of def paragraph_style(para)
######################################################################
## Main Loop around the ARGV which should contain only file names
# Treat every command-line argument as a candidate *.docx file.
# Arguments whose extension is not .docx (case-insensitive) are reported on
# STDERR and skipped; for the rest, each paragraph whose style is "BodyText"
# is printed to standard output, preceded by a blank line.
ARGV.each do |param|
  given_document = Pathname.new(param)

  unless given_document.extname.downcase == '.docx'
    # STDERR.puts is used rather than Kernel#warn on purpose: the shebang
    # runs ruby with -W0, which silences warn.
    STDERR.puts
    STDERR.puts "WARNING: Not a *.docx file -- skipping."
    STDERR.puts " File: #{given_document}"
    STDERR.puts
    next
  end

  document = Docx::Document.open(given_document.to_s)

  document.paragraphs.each do |para|
    puts "\n#{para.text}" if paragraph_style(para) == 'BodyText'
  end

  # Blank line separating the output of one document from the next.
  puts
end # end of ARGV.each do |param|