From 2a63a8158579cae84dd3e3acd0e5b7e0da6c6b53 Mon Sep 17 00:00:00 2001
From: Nathan Stitt <nathan@stitt.org>
Date: Thu, 5 Feb 2015 11:35:59 -0600
Subject: [PATCH 1/6] Note needed fonts for CHI/JPN/KOR document support

---
 index.html | 50 ++++++++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/index.html b/index.html
index be419ac..a5c37e4 100755
--- a/index.html
+++ b/index.html
@@ -159,6 +159,12 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         <tt>aptitude install libreoffice</tt><br />
         On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
       </li>
+      <li>
+        (Optional) Install fonts to process documents that use <a href="https://help.ubuntu.com/community/Fonts#Chinese.2C_Japanese.2C_and_Korean_Fonts">Chinese, Japanese, and Korean Fonts</a>.
+        On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
+        <tt>aptitude install ttf-wqy-microhei ttf-wqy-zenhei ttf-kochi-gothic ttf-kochi-mincho fonts-nanum</tt><br />
+        On the Mac, the fonts should already be present. However, you can always download the TTF files and install them using <a href="http://support.apple.com/en-us/HT201749">Font Book</a>.
+      </li>
     </ol>
 
     <p><i>
@@ -183,7 +189,7 @@ <h2 id="usage">Usage</h2>
       and format. Pass <tt>--pages</tt> or <tt>-p</tt> to choose the specific pages to
       image. Passing<br /> <tt>--size</tt> or <tt>-s</tt> will specify the desired
       image resolution, <tt>--density</tt> or <tt>-d</tt> will specify the DPI to rasterize the images
-      at during conversion by GraphicsMagick, and <tt>--format</tt> or <tt>-f</tt> 
+      at during conversion by GraphicsMagick, and <tt>--format</tt> or <tt>-f</tt>
       will select the format of the final images.
     </p>
 <pre>
@@ -201,7 +207,7 @@ <h2 id="usage">Usage</h2>
       pass <tt>--pages all</tt>. You can use the <tt>--ocr</tt> and <tt>--no-ocr</tt>
       flags to force OCR, or disable it, respectively. By default (if Tesseract is installed)
       Docsplit will OCR the text of each page for which it fails to extract text
-      directly from the document. Docsplit will also attempt to clean up garbage 
+      directly from the document. Docsplit will also attempt to clean up garbage
       characters in the OCR'd text &mdash; to disable this, pass the
       <tt>--no-clean</tt> flag.
     </p>
@@ -272,7 +278,7 @@ <h2 id="internals">Internals</h2>
       <a href="http://poppler.freedesktop.org/">Poppler</a>,
       <a href="http://www.accesspdf.com/pdftk/">PDFTK</a>,
       <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>, and
-      <a href="http://www.libreoffice.org/">LibreOffice</a> libraries. 
+      <a href="http://www.libreoffice.org/">LibreOffice</a> libraries.
       Poppler is used to extract text and metadata from PDF documents,
       PDFTK is used to split them apart into pages, and GraphicsMagick is used to generate
       the page images (internally, it's rendering them with
@@ -291,7 +297,7 @@ <h2 id="internals">Internals</h2>
     </p>
 
     <h2 id="changes">Change Log</h2>
-    
+
     <p>
       <b class="header">0.7.6</b><small> &ndash; Nov. 16, 2014</small><br />
       Docsplit will now automatically use Tesseract's orientation detection model
@@ -308,7 +314,7 @@ <h2 id="changes">Change Log</h2>
       <b class="header">0.7.2</b><small> &ndash; Feb. 23, 2013</small><br />
       Bug fixes for LibreOffice support.
     </p>
-    
+
     <p>
       <b class="header">0.7.0</b><small> &ndash; Feb. 23, 2013</small><br />
       Docsplit now expresses a preference for LibreOffice over OpenOffice, with
@@ -317,81 +323,81 @@ <h2 id="changes">Change Log</h2>
       Improved unicode support now correctly collects non-ascii characters from
       pdfinfo.
     </p>
-    
+
     <p>
       <b class="header">0.6.4</b><small> &ndash; Nov. 12, 2012</small><br />
       Added a language flag for the Docsplit commandline, fixed several bugs,
       and began preparations for the deprecation of pdftk.
     </p>
-    
+
     <p>
       <b class="header">0.6.2</b><small> &ndash; Nov. 22, 2011</small><br />
       Bugfix to escape document names during file type detection.
     </p>
-    
+
     <p>
       <b class="header">0.6.1</b><small> &ndash; Nov. 18, 2011</small><br />
       Docsplit now supports converting documents using LibreOffice
       as well as OpenOffice, through JODConverter 3.0 beta4.
     </p>
-    
+
     <p>
       <b class="header">0.6.0</b><small> &ndash; Sept. 13, 2011</small><br />
-      Docsplit should now handle shelling out for documents with arbitrary 
-      characters in their filenames correctly, thanks to a series of 
+      Docsplit should now handle shelling out for documents with arbitrary
+      characters in their filenames correctly, thanks to a series of
       epic patches from Vladimir Rybas.
-      A <tt>--density</tt> option was added for specifying the resolution of 
+      A <tt>--density</tt> option was added for specifying the resolution of
       rasterization when generating images from documents.
       The image resolution for OCR has been doubled from 200 to 400 DPI &mdash;
-      this shouldn't make a noticeable difference for normal docs, but will make 
+      this shouldn't make a noticeable difference for normal docs, but will make
       a world of difference for the fine print.
       Docsplit now uses GraphicsMagick's <tt>--despeckle</tt> before OCR.
     </p>
-    
+
     <p>
       <b class="header">0.5.2</b><small> &ndash; May 13, 2011</small><br />
       For transparent conversion to PDF, made Docsplit prefer GraphicsMagick
       over OpenOffice, when the file format is one that GraphicsMagick is able
       to read: (png, gif, jpg, jpeg, tif, tiff, bmp, pnm, ppm, svg, eps).
     </p>
-    
+
     <p>
       <b class="header">0.5.1</b><small> &ndash; April 26, 2011</small><br />
       Minor tweaks to the <tt>TextCleaner</tt> to be more lenient about acryonms
       with hyphens, and words with four vowels in a row.
     </p>
-    
+
     <p>
       <b class="header">0.5.0</b><br />
       Added a <tt>Docsplit::TextCleaner</tt> class which is used to post-process
       OCR'd text, and remove garbage characters that are created when Tesseract
       encounters non-english text. To disable the cleanup, pass <tt>--no-clean</tt>.
     </p>
-    
+
     <p>
       <b class="header">0.4.1</b><br />
       Upgraded the JODConverter dependency for PDF conversion via OpenOffice to
-      3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported 
+      3.0 beta. Added PNG, GIF, TIF, JPG, and BMP to the list of supported
       formats.
     </p>
-    
+
     <p>
       <b class="header">0.3.4</b><br />
       Adding a suggested optimization from the GraphicsMagick list -- only ever
       generate one page image per GraphicsMagick call. Saves large amounts of
       disk space for tempfiles on long documents.
     </p>
-    
+
     <p>
       <b class="header">0.3.3</b><br />
       Start using the MAGICK_TMPDIR environment variable to prevent parallel
       Docsplit runs from having the potential to clobber each other's temporary
       image files.
     </p>
-    
+
     <p>
       <b class="header">0.3.1</b><br />
-      Added a memory limit to GraphicsMagick while generating the TIFFs for 
+      Added a memory limit to GraphicsMagick while generating the TIFFs for
       Tesseract OCR -- prevents <tt>gm</tt> from gobbling up all available memory
       on large files.
     </p>

From c42ce95a597bcb03738182f77b27e55f365c6109 Mon Sep 17 00:00:00 2001
From: Andrew Volozhanin <linuxheadrus@gmail.com>
Date: Wed, 17 Jun 2015 20:13:24 +0500
Subject: [PATCH 2/6] Add layout option to keep layout during text extraction

It passes the -layout option to pdftotext.
---
 lib/docsplit/text_extractor.rb | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..93973f6 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -102,17 +102,26 @@ def run(command)
       result
     end
 
+    # Run pdftotext command
+    def run_pdftotext(pdf, text_path, options=[])
+      options << '-enc UTF-8'
+      options << '-layout' if @keep_layout
+
+      run "pdftotext #{options.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+    end
+
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
@@ -126,6 +135,7 @@ def extract_options(options)
       @language           = options[:language] || 'eng'
       @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
       @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @keep_layout        = options.fetch(:layout, false)
     end
 
   end

From ee355d5fa4f70400396d52a3d048ef359267a1ec Mon Sep 17 00:00:00 2001
From: "T. Kim Nguyen" <tkimnguyen@users.noreply.github.com>
Date: Thu, 12 Apr 2018 11:33:49 -0500
Subject: [PATCH 3/6] update pdftk installer URL for Mac

---
 index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/index.html b/index.html
index a5c37e4..ccbcb95 100755
--- a/index.html
+++ b/index.html
@@ -149,7 +149,7 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         (Optional) Install <a href="http://www.accesspdf.com/pdftk/">pdftk</a>.
         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
         <tt>aptitude install pdftk</tt><br />
-        On the Mac, you can <a href="http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/">download a recent installer</a> for the binary.
+        On the Mac, you can <a href="https://www.pdflabs.com/tools/pdftk-server/">download a recent installer</a> for the binary.
         Without <b>pdftk</b> installed, you can use Docsplit, but won't be able
         to split apart a multi-page PDF into single-page PDFs.
       </li>

From 6418e2531d87245a99d21a1ac942e46e53a41532 Mon Sep 17 00:00:00 2001
From: Anuja Ware-Mahajan <anuja@joshsoftware.com>
Date: Mon, 8 Nov 2021 15:58:21 +0530
Subject: [PATCH 4/6] Documentation link correction

---
 README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README b/README
index 34ce202..81374da 100755
--- a/README
+++ b/README
@@ -15,7 +15,7 @@
   gem install docsplit
   
   For documentation, usage, and examples, see:
-  http://documentcloud.github.com/docsplit/
+  https://documentcloud.github.io/docsplit/
   
   To suggest a feature or report a bug: 
   http://github.com/documentcloud/docsplit/issues/

From 990b0452b70b2d21e97157e60538e79c47723114 Mon Sep 17 00:00:00 2001
From: = <=>
Date: Fri, 5 May 2023 13:23:34 -0700
Subject: [PATCH 5/6] Replace deprecated File.exists? with File.exist?

---
 lib/docsplit/page_extractor.rb | 4 ++--
 lib/docsplit/text_extractor.rb | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index 145c980..cb84e34 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -10,8 +10,8 @@ def extract(pdfs, opts)
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
-        
+        FileUtils.mkdir_p @output unless File.exist?(@output)
+
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 93973f6..27e4d29 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -28,7 +28,7 @@ def initialize
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages

From 6127e3912b8db94ed84dca6be5622d3d5ec0d879 Mon Sep 17 00:00:00 2001
From: = <=>
Date: Fri, 5 May 2023 13:36:07 -0700
Subject: [PATCH 6/6] Fix more instances of deprecated File.exists?

---
 lib/docsplit/image_extractor.rb |  4 ++--
 lib/docsplit/page_extractor.rb  |  2 +-
 lib/docsplit/pdf_extractor.rb   | 30 +++++++++++++++---------------
 lib/docsplit/text_extractor.rb  |  2 +-
 test/test_helper.rb             |  2 +-
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..8bc4d1d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -33,7 +33,7 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
@@ -48,7 +48,7 @@ def convert(pdf, size, format, previous=nil)
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index cb84e34..0aef939 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -18,7 +18,7 @@ def extract(pdfs, opts)
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end
diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
index 21861e2..a479265 100644
--- a/lib/docsplit/pdf_extractor.rb
+++ b/lib/docsplit/pdf_extractor.rb
@@ -16,7 +16,7 @@ def osx?
     def linux?
       !!HOST_OS.match(/linux/i)
     end
-    
+
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
@@ -35,10 +35,10 @@ def libre_office?
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
-    
+
     # A set of default locations to search for office software
     # These have been extracted from JODConverter.  Each listed
-    # path should contain a directory "program" which in turn 
+    # path should contain a directory "program" which in turn
     # contains the "soffice" executable.
     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
@@ -69,7 +69,7 @@ def office_search_paths
       end
       search_paths
     end
-    
+
     # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
@@ -78,10 +78,10 @@ def office_executable
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
-      
+
       # The location of the office executable is OS dependent
       path_pieces = ["soffice"]
       if windows?
@@ -91,15 +91,15 @@ def office_executable
       else
         path_pieces += [["program", "soffice"]]
       end
-      
+
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
@@ -107,16 +107,16 @@ def office_executable
       raise OfficeNotFound, "No office software found" unless @@executable
       @@executable
     end
-    
+
     # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
-    
+
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
@@ -128,7 +128,7 @@ def extract(docs, opts)
           if libre_office?
             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
             ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
-            
+
             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
@@ -147,9 +147,9 @@ def extract(docs, opts)
     LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
 
     HEADLESS      = "-Djava.awt.headless=true"
-    
+
     private
-    
+
     # Runs a Java command, with quieted logging, and the classpath set properly.
     def run_jod(command, pdfs, opts, return_output=false)
 
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 27e4d29..f3390e8 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -80,7 +80,7 @@ def extract_from_ocr(pdf, pages)
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
diff --git a/test/test_helper.rb b/test/test_helper.rb
index 9c37b2b..2357c5a 100755
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -10,7 +10,7 @@ class Minitest::Test
   OUTPUT = 'test/output'
 
   def clear_output
-    FileUtils.rm_r(OUTPUT) if File.exists?(OUTPUT)
+    FileUtils.rm_r(OUTPUT) if File.exist?(OUTPUT)
   end
 
   def teardown