class PDF::Reader::PageTextReceiver
Builds a UTF-8 string of all the text on a single page by processing all the operators in a content stream.
Constants
- SPACE
Attributes
Public Instance Methods
Source
# File lib/pdf/reader/page_text_receiver.rb, line 83
# Returns the page text as a String: the current runs are handed to a
# PageLayout together with the page's :MediaBox rectangle, and the layout's
# #to_s result is returned.
def content
  layout_bounds = @page.rectangles[:MediaBox]
  layout = PageLayout.new(runs, layout_bounds)
  layout.to_s
end
deprecated
Source
# File lib/pdf/reader/page_text_receiver.rb, line 122
# Asks the page state to invoke the XObject registered under +label+.
# When the yielded object is a PDF::Reader::FormXObject, this receiver walks
# it; any other kind of XObject is ignored.
def invoke_xobject(label)
  @state.invoke_xobject(label) do |xobject|
    xobject.walk(self) if xobject.is_a?(PDF::Reader::FormXObject)
  end
end
XObjects
Source
# File lib/pdf/reader/page_text_receiver.rb, line 108
# Handler for the ' operator: advances the state to the start of the next
# line, then shows +str+ via #show_text.
def move_to_next_line_and_show_text(str) # '
  @state.move_to_start_of_next_line

  show_text(str)
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 43
# Resets the receiver for +page+: a fresh PageState is built for it, the page
# is remembered, and the collected content and characters are cleared.
def page=(page)
  # The state is built first so a failure there leaves @page untouched.
  @state = PageState.new(page)
  @page = page

  @content = []
  @characters = []
end
starting a new page
Source
# File lib/pdf/reader/page_text_receiver.rb, line 50
# Returns the collected characters after passing them through a fixed
# sequence of filters. Recognised keys in +opts+:
#
#   :rect              - bounding rectangle; defaults to the page's :CropBox.
#                        A nil/false value skips the rectangle filter.
#   :skip_zero_width   - drop zero-width runs (default true)
#   :skip_overlapping  - drop redundant overlapping runs (default true)
#   :merge             - merge neighbouring runs (default true)
#   :only              - passed to AdvancedTextRunFilter.only when present
#   :exclude           - passed to AdvancedTextRunFilter.exclude when present
#
# Runs with empty strings are always removed.
def runs(opts = {})
  selected = @characters

  bounds = opts.fetch(:rect, @page.rectangles[:CropBox])
  selected = BoundingRectangleRunsFilter.runs_within_rect(selected, bounds) if bounds

  selected = ZeroWidthRunsFilter.exclude_zero_width_runs(selected) if opts.fetch(:skip_zero_width, true)
  selected = OverlappingRunsFilter.exclude_redundant_runs(selected) if opts.fetch(:skip_overlapping, true)
  selected = NoTextFilter.exclude_empty_strings(selected)
  selected = merge_runs(selected) if opts.fetch(:merge, true)

  only_filter = opts.fetch(:only, nil)
  selected = AdvancedTextRunFilter.only(selected, only_filter) if only_filter

  exclude_filter = opts.fetch(:exclude, nil)
  selected = AdvancedTextRunFilter.exclude(selected, exclude_filter) if exclude_filter

  selected
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 113
# Handler for the " operator: stores the word spacing (+aw+) and character
# spacing (+ac+) on the state, then delegates to
# #move_to_next_line_and_show_text with +string+.
def set_spacing_next_line_show_text(aw, ac, string) # "
  @state.set_word_spacing(aw)
  @state.set_character_spacing(ac)

  move_to_next_line_and_show_text(string)
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 92
# Handler for the Tj operator; a thin public wrapper that forwards +string+
# to #internal_show_text.
def show_text(string) # Tj (AWAY)
  internal_show_text(string)
end
Text Showing Operators
record text that is drawn on the page
Source
# File lib/pdf/reader/page_text_receiver.rb, line 96
# Handler for the TJ operator. +params+ is iterated in order:
# String elements are shown via #internal_show_text, Numeric elements are
# passed to the state as a glyph displacement, and anything else is skipped.
def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
  params.each do |element|
    case element
    when String
      internal_show_text(element)
    when Numeric
      @state.process_glyph_displacement(0, element, false)
    end
  end
end
Private Instance Methods
Source
# File lib/pdf/reader/page_text_receiver.rb, line 158
# Transforms the point (+x+, +y+) according to the page's rotate value and
# returns the result as a two-element array:
#
#   90   -> [y, -x]
#   180  -> [-x, -y]
#   270  -> [-y, x]
#   else -> [x, y] (unchanged)
def apply_rotation(x, y)
  rotation = @page.rotate

  return [y, x * -1] if rotation == 90
  return [x * -1, y * -1] if rotation == 180
  return [y * -1, x] if rotation == 270

  [x, y]
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 184
# Walks +chars+ in order and folds each element into the previously collected
# run whenever that run reports it as mergable (the two are combined with +);
# otherwise the element starts a new run. Returns the array of resulting runs.
def group_chars_into_runs(chars)
  merged = []

  chars.each do |candidate|
    previous = merged.last

    if !merged.empty? && previous.mergable?(candidate)
      merged[-1] = previous + candidate
    else
      merged.push(candidate)
    end
  end

  merged
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 133
# Records every glyph of +string+ as a TextRun and advances the text state.
#
# +string+ is type-checked via PDF::Reader::Error.validate_type_as_malformed.
# Raises PDF::Reader::MalformedPDFError when the state has no current font.
#
# For each glyph code unpacked by the current font:
# 1. the current position is read from the state and rotated for the page,
# 2. the glyph is converted to UTF-8,
# 3. a TextRun is appended to @characters unless the text equals SPACE,
# 4. the state is advanced by the glyph's width.
def internal_show_text(string)
  PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
  raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?

  glyph_codes = @state.current_font.unpack(string)
  glyph_codes.each do |glyph_code|
    # Position of the glyph about to be painted, adjusted for page rotation.
    origin_x, origin_y = @state.trm_transform(0, 0)
    origin_x, origin_y = apply_rotation(origin_x, origin_y)

    text = @state.current_font.to_utf8(glyph_code)

    # The width is needed both for the recorded run and for displacing the
    # state so the next glyph lands in the correct position.
    width = @state.current_font.glyph_width_in_text_space(glyph_code)
    horizontal_scale = 1
    run_width = width * @state.font_size * horizontal_scale

    unless text == SPACE
      @characters << TextRun.new(origin_x, origin_y, run_width, @state.font_size, text)
    end

    @state.process_glyph_displacement(width, 0, text == SPACE)
  end
end
Source
# File lib/pdf/reader/page_text_receiver.rb, line 176
# Buckets +runs+ by the integer part of their y coordinate, sorts each bucket
# and merges neighbouring runs within it via #group_chars_into_runs, then
# returns all resulting runs as one sorted, flat array.
def merge_runs(runs)
  runs_by_line = runs.group_by { |run| run.y.to_i }

  merged_per_line = runs_by_line.values.map do |line_runs|
    group_chars_into_runs(line_runs.sort)
  end

  merged_per_line.flatten.sort
end
take a collection of TextRun
objects and merge any that are in close proximity