An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.
Usage:
    receiver = PDF::Reader::TextReceiver.new($stdout)
    PDF::Reader.file("somefile.pdf", receiver)
DEPRECATED: this class was deprecated in version 0.11.0 and will
eventually be removed
Initialize with the library user's receiver
# File lib/pdf/reader/text_receiver.rb, line 40
#
# Builds a receiver that forwards extracted text to +main_receiver+.
#
# main_receiver - object that finished page text is appended to with +<<+
#                 (see end_page).
#
# Also starts with an empty stack of page-size records (+@upper_corners+).
def initialize(main_receiver)
  @main_receiver = main_receiver
  @upper_corners = []
end
Called when the document parsing begins
# File lib/pdf/reader/text_receiver.rb, line 46
#
# Callback for the start of document parsing. Replaces the +@upper_corners+
# stack with a fresh, empty one.
#
# root - supplied by the caller; not used by this method.
#
# Returns the new empty Array.
def begin_document(root)
  @upper_corners = []
end
Called when new page parsing begins
# File lib/pdf/reader/text_receiver.rb, line 64
#
# Callback for the start of a page. Resets all per-page bookkeeping.
#
# info - the page dictionary; stored in +@page+ and passed to
#        media_box_check to work out the page's upper-right corner.
#
# Returns false (the value assigned to +@written_to+).
def begin_page(info)
  @page = info

  # Text state starts as a one-element stack of defaults.
  @state = [{
    :char_spacing  => 0,
    :word_spacing  => 0,
    :hori_scaling  => 100,
    :leading       => 0,
    :tj_adjustment => 0,
  }]

  corners = media_box_check(info)
  @upper_corners.push(corners)

  @output = []
  @line = 0
  @location = 0
  @displacement = {}

  # Nothing has been positioned yet, so the lowest location seen so far is
  # the top edge of the page.
  @smallest_y_loc = corners[:ury]
  @written_to = false
end
# File lib/pdf/reader/text_receiver.rb, line 55
#
# Callback for the start of a page container. Pushes the container's
# upper-right corner (as computed by media_box_check) onto
# +@upper_corners+.
#
# page - the container's dictionary.
#
# Returns the +@upper_corners+ Array.
def begin_page_container(page)
  corners = media_box_check(page)
  @upper_corners << corners
end
PDF operator BT
# File lib/pdf/reader/text_receiver.rb, line 92
#
# Handles BT: pushes a shallow copy of the current text state onto the
# +@state+ stack, so changes made inside the text object can be dropped
# again by end_text_object.
#
# Returns the +@state+ Array.
def begin_text_object
  snapshot = @state.last.dup
  @state << snapshot
end
# File lib/pdf/reader/text_receiver.rb, line 234
#
# Maps a vertical location to an output line number and records it as the
# current position.
#
# new_loc - the new vertical location (a number). It is frozen before being
#           used as a key of +@displacement+.
#
# Updates +@displacement+, +@smallest_y_loc+, +@location+ and +@line+.
#
# Returns the line number now stored in +@line+.
def calculate_line_and_location(new_loc)
  key = new_loc.freeze

  if !@written_to
    # Nothing has been written on this page yet: this location is line 0.
    @displacement[key] = 0
  elsif !@displacement.has_key?(key)
    if key < @location || key < @smallest_y_loc
      # Below the current position, or below everything seen so far:
      # start the line after the current one.
      @displacement[key] = @line + 1
    else
      # Not below the current position: snap to the closest known location
      # underneath it, so the text joins that existing line.
      nearest_below = @displacement.keys.select { |known| known < key }.max
      if nearest_below.nil?
        # No known location lies below this one (text was shown before any
        # positioning happened). Previously nil became the key and the
        # comparison below raised NoMethodError; keep the location and
        # place it on line 0 instead.
        @displacement[key] = 0
      else
        key = nearest_below
      end
    end
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end
Called when the document parsing ends
# File lib/pdf/reader/text_receiver.rb, line 51
#
# Callback for the end of document parsing. Empties the text-state stack.
#
# +@state+ is only created by begin_page, so it is still nil when no page
# was ever started; in that case there is nothing to clear.
#
# Returns the emptied +@state+ Array, or nil when no state exists.
def end_document
  @state.clear if @state
end
Called when page parsing ends
# File lib/pdf/reader/text_receiver.rb, line 86
#
# Callback for the end of a page. Joins the page's output lines with
# newlines, appends the result to +@main_receiver+, then drops the page's
# entry from +@upper_corners+.
#
# Returns the entry popped from +@upper_corners+.
def end_page
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end
# File lib/pdf/reader/text_receiver.rb, line 59
#
# Callback for the end of a page container. Removes the entry that
# begin_page_container pushed onto +@upper_corners+.
#
# Returns the removed entry (nil if the stack was empty).
def end_page_container
  @upper_corners.pop
end
PDF operator ET
# File lib/pdf/reader/text_receiver.rb, line 97
#
# Handles ET: discards the text state that begin_text_object pushed,
# making the previous state current again.
#
# Returns the discarded state Hash.
def end_text_object
  @state.pop
end
# File lib/pdf/reader/text_receiver.rb, line 222
#
# Works out the upper-right corner that applies to +dict+.
#
# dict - a page or page-container dictionary. If it has a +:MediaBox+ key,
#        the value is read by index as [llx, lly, urx, ury].
#
# The result starts as a copy of the top of +@upper_corners+ (or zeros when
# the stack is empty) and is overridden with the box's width and height when
# +dict+ carries its own +:MediaBox+. Neither +dict+ nor the stack entry is
# modified.
#
# Returns a Hash with +:urx+ and +:ury+.
def media_box_check(dict)
  inherited = @upper_corners.last
  corners = inherited ? inherited.dup : {:urx => 0, :ury => 0}
  return corners unless dict.has_key?(:MediaBox)

  box = dict[:MediaBox]
  corners[:urx] = box[2] - box[0]
  corners[:ury] = box[3] - box[1]
  corners
end
PDF operator Td
# File lib/pdf/reader/text_receiver.rb, line 134
#
# Handles Td: shifts the current vertical location by +ty+ and recomputes
# the output line for it.
#
# tx - horizontal offset; ignored by this receiver.
# ty - vertical offset added to +@location+.
#
# Returns whatever calculate_line_and_location returns.
def move_text_position(tx, ty)
  target = @location + ty
  calculate_line_and_location(target)
end
PDF operator TD
# File lib/pdf/reader/text_receiver.rb, line 140
#
# Handles TD: records +ty+ as the text leading, then moves the text position
# by (+tx+, +ty+).
#
# The leading is stored with the same sign as +ty+; the original source
# carried a commented-out "* -1" at this point.
#
# tx - horizontal offset, passed through to move_text_position.
# ty - vertical offset, used both as the leading and as the move.
#
# Returns whatever move_text_position returns.
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)
  move_text_position(tx, ty)
end
PDF operator '
# File lib/pdf/reader/text_receiver.rb, line 210
#
# Handles the ' operator: moves to the start of the next line, then shows
# +string+ there.
#
# string - the text to show.
#
# Returns whatever show_text returns.
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line
  show_text(string)
end
PDF operator T*
# File lib/pdf/reader/text_receiver.rb, line 129
#
# Handles T*: moves the text position vertically by the current leading,
# with no horizontal offset.
#
# NOTE(review): the stored leading is passed to move_text_position without
# negation, whereas the PDF specification defines T* as "0 -Tl Td". This
# looks like a deliberate sign convention shared with the TD handler -
# confirm before changing either side.
#
# Returns whatever move_text_position returns.
def move_to_start_of_next_line
  leading = @state.last[:leading]
  move_text_position(0, leading)
end
PDF operator Tc
# File lib/pdf/reader/text_receiver.rb, line 109
#
# Handles Tc: stores +n+ as the character spacing of the current text state.
#
# n - the new character spacing.
#
# Returns +n+.
def set_character_spacing(n)
  current = @state.last
  current[:char_spacing] = n
end
PDF operator Tz
# File lib/pdf/reader/text_receiver.rb, line 119
#
# Handles Tz: stores the horizontal scaling of the current text state as a
# ratio of +n+ percent.
#
# The division is done in floating point. With integer division an Integer
# operand was truncated (50 became 0, 150 became 1) while the same value
# given as a Float produced the real ratio.
#
# n - the scaling as a percentage (100 means unscaled).
#
# Returns the stored ratio as a Float.
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end
PDF operator "
# File lib/pdf/reader/text_receiver.rb, line 216
#
# Handles the " operator: sets word and character spacing, then moves to
# the next line and shows +string+.
#
# aw     - the new word spacing.
# ac     - the new character spacing.
# string - the text to show.
#
# Returns whatever move_to_next_line_and_show_text returns.
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end
PDF operator TL
# File lib/pdf/reader/text_receiver.rb, line 124
#
# Handles TL: stores +n+ as the leading of the current text state.
#
# n - the new leading.
#
# Returns +n+.
def set_text_leading(n)
  current = @state.last
  current[:leading] = n
end
PDF operator Tm
# File lib/pdf/reader/text_receiver.rb, line 102
#
# Handles Tm: only the sixth operand (named +f+ in the PDF specification) is
# used, as the new vertical location.
#
# args - the operands a, b, c, d, e, f in that order.
#
# Returns whatever calculate_line_and_location returns.
def set_text_matrix_and_text_line_matrix(*args)
  # Operands a..e are not needed by this receiver.
  vertical_location = args[5]
  calculate_line_and_location(vertical_location)
end
PDF operator Tw
# File lib/pdf/reader/text_receiver.rb, line 114
#
# Handles Tw: stores +n+ as the word spacing of the current text state.
#
# n - the new word spacing.
#
# Returns +n+.
def set_word_spacing(n)
  current = @state.last
  current[:word_spacing] = n
end
PDF operator Tj
# File lib/pdf/reader/text_receiver.rb, line 146
#
# Handles Tj: appends +string+ to the output line for the current position.
#
# When the current TJ adjustment is below -1000, one space per 900 units of
# adjustment is inserted in front of the text.
#
# string - the text to append.
#
# Returns true, and marks the page as written to.
def show_text(string)
  current_line = (@output[@line] ||= "")
  adjustment = @state.last[:tj_adjustment]

  current_line << " " * (adjustment.abs/900) if adjustment < -1000
  current_line << string

  @written_to = true
end
PDF operator TJ
# File lib/pdf/reader/text_receiver.rb, line 194
#
# Handles TJ: walks an array that mixes strings with numeric position
# adjustments.
#
# Each number becomes the current +:tj_adjustment+ for the strings that
# follow it; every other element is passed to show_text. The adjustment in
# force before the call is restored afterwards.
#
# Numbers are matched with Integer and Float. The former +Fixnum+ constant
# no longer exists on Ruby 3.2 and later, so referencing it raised NameError
# as soon as a string element was reached.
#
# params - Array of strings and numbers.
#
# Returns the restored adjustment value.
def show_text_with_positioning(params)
  prev_adjustment = @state.last[:tj_adjustment]

  params.each do |p|
    case p
    when Integer, Float
      @state.last[:tj_adjustment] = p
    else
      show_text(p)
    end
  end

  @state.last[:tj_adjustment] = prev_adjustment
end
# File lib/pdf/reader/text_receiver.rb, line 158
#
# Alternative renderer that writes each character of +string+ into a
# space-padded line buffer, at a column and row derived from +@tm+.
#
# NOTE(review): +@tm+, +TS_UNITS_PER_H_CHAR+, +TS_UNITS_PER_V_CHAR+ and
# +Matrix+ are not defined in the code visible here, and nothing visible
# calls this method - confirm it is still reachable before relying on it.
#
# The per-character loop uses each_char; the regex literal in the split call
# that used to drive it was garbled into an unterminated "%r/".
#
# string - the text to render.
#
# Returns nil, before placing any character, when x + string.size >= urx;
# otherwise the updated +@tm+.
def super_show_text(string)
  urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
  ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

  x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
  y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i

  # A line that does not exist yet starts as urx.to_i spaces.
  place = (@output[y] ||= (" " * urx.to_i))

  return if x+string.size >= urx

  string.each_char do |c|
    chars = 1
    case c
    when " "
      # A space takes one cell plus the current word spacing.
      chars += @state.last[:word_spacing].to_i
      place[x-1, chars] = (" " * chars)
    else
      # Other characters take one cell plus character spacing, less the TJ
      # adjustment in thousands, and never fewer than one cell.
      chars += @state.last[:char_spacing].to_i
      chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
      chars = 1 if chars < 1
      place[x-1] = c
      place[x, chars-1] = (" " * (chars-1)) if chars > 1
    end
    x += chars
  end

  @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
end