/usr/bin/ocr4gamera

#!/usr/bin/python
#
# Copyright (C) 2009-2010 Rene Baston, Christoph Dalitz
#               2014      Fabian Schmitt
#               2011-2014 Christoph Dalitz
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#

import codecs #keep an eye on encoding stuff...  http://evanjones.ca/python-utf8.html
import sys
import time
import os.path

VERSION = "1.2.0"

def usage(returncode):
  """Print the command line help to stdout and terminate the script.

  returncode -- exit status handed to sys.exit()

  This function never returns; it always raises SystemExit.
  """
  sys.stdout.write(
      "Usage:\n\tocr4gamera -x <traindata> [options] <imagefile>\n"
      "Options (can be short or long):\n"
      "\t-v <int>, --verbosity=<int>\n"
      "\t   set verbosity level to <int>; possible values are\n"
      "\t   0 (default): silent operation\n"
      "\t   1:  information on progress\n"
      # the script writes the debug images for every verbosity above 1
      "\t   >=2: segmentation info is written to PNG files with prefix 'debug_'\n"
      "\t-h, --help\n"
      "\t   this help message\n"
      "\t--version\n"
      "\t   print version and exit\n"
      "\t-d, --deskew\n"
      "\t   do a skew correction (recommended)\n"
      "\t-mf <ws>, --median_filter=<ws>\n"
      "\t   smooth the input image with a median filter with window size <ws>\n"
      "\t   default is <ws>=0, which means no smoothing\n"
      "\t-ds <s>, --despeckle=<s>\n"
      "\t   remove all speckle with size <= <s>\n"
      "\t   default is <s> = 0, which means no despeckling\n"
      "\t-f, --filter\n"
      "\t   filter out connected components that are very big or very small\n"
      "\t-a, --automatic_group\n"
      "\t   autogroup glyphs with classifier\n"
      "\t-x <xml>, --xmlfile=<xml>\n"
      "\t   read training data from <xml>\n"
      "\t-k <k>, --k=<k>\n"
      "\t   number of neighbors used by kNN classifier (default is <k> = 1)\n"
      "\t-o <txt>, --output=<txt>\n"
      "\t   write recognized text to file <txt>\n"
      "\t   (otherwise it is written to stdout)\n"
      "\t-od <dir>, --output_directory=<dir>\n"
      "\t   writes for each input image <img> the recognized text to '<dir>/<img>.txt'\n"
      # the long form of -o is --output; there is no option --outfile
      "\t   note that this option cannot be used in combination with -o (--output)\n"
      "\t   (otherwise it is written to stdout)\n"
      "\t-c <csv>, --extra_chars_csvfile=<csv>\n"
      "\t   read additional class name conversions from file <csv>\n"
      "\t   <csv> must contain one conversion per line\n"
      "\t-R <rules>, --heuristic_rules=<rules>\n"
      "\t   apply heuristic rules <rules> for disambiguation of some chars\n"
      "\t   <rules> can be 'roman' (default) or 'none' (for no rules)\n"
      "\t-D, --dictionary_correction\n"
      "\t   dictionary correction (requires aspell or ispell)\n"
      "\t-L <lang>, --dictionary_language=<lang>\n"
      "\t   language to be used by aspell (when option -D is set)\n"
      "\t-e <int>, --edit_distance=<int>\n"
      "\t   dictionary correct only when edit distance not more than <int>\n"
      "\t-ho, --hocr_out\n"
      "\t    writes output as hocr file (only works with the -o option)\n"
      "\t-hi <hocrfile>, --hocr_in=<hocrfile>\n"
      "\t   uses an hocr input file for textline segmentation\n")
  sys.exit(returncode)

def correct(sentence, lang):
  """Return *sentence* with misspelled words replaced by dictionary suggestions.

  The sentence is split at single blanks and every word accepted by
  correct_this() is sent to a spell checker running in pipe mode ('-a').
  aspell is tried first, ispell is the fallback.  A word reported as
  misspelled is replaced by the first suggestion when the edit distance
  between both is not greater than opt.distance.  One trailing punctuation
  sign is removed before the lookup and appended again afterwards.

  sentence -- text of one line; words are encoded as UTF-8 for the checker
  lang     -- dictionary language; '' selects the checker's default

  Reads the global 'opt' (verbosity, distance).  Terminates the script with
  exit status 1 when the spell checker does not start.
  """
  from gamera.plugins.structural import edit_distance
  from popen2 import Popen3

  def reap(proc):
    # close all three pipes and collect the exit status, so that neither
    # file descriptors nor zombie processes are left behind
    for pipe in (proc.tochild, proc.fromchild, proc.childerr):
      if pipe is not None:
        pipe.close()
    proc.wait()

  trim_signs = ('.', ',', '!', '?', ';', ':', '"')
  spell_prog = 'aspell'
  lang_opt = '-l'
  words = sentence.split(" ")

  # probe for aspell: any output on stderr is taken as "not available"
  probe = Popen3('%s' % spell_prog, True)
  if opt.verbosity:
    print('Using %s for word-correction.\n' % spell_prog)
  probe_errors = probe.childerr.readlines()
  reap(probe)
  if probe_errors != []:
    if opt.verbosity:
      print('%s is not installed\n' % spell_prog)
    spell_prog = 'ispell'
    if opt.verbosity:
      print('Using %s for word-correction.\n' % spell_prog)
    lang_opt = '-d'
    probe = Popen3('%s Q' % spell_prog, True)
    probe_errors = probe.childerr.readlines()
    reap(probe)
    if probe_errors != ['ispell:  specified file does not exist\n']:
      # Only a warning: the comparison above depends on the exact wording of
      # the ispell message.  The definitive check is the banner line below.
      sys.stderr.write('Neither aspell nor ispell is installed on your system. '
                       'Please make sure to install either of these programs.\n')

  # NOTE(review): the command is a shell string, 'lang' comes unquoted from
  # the command line.
  if lang == '':
    # open with local setting language
    if opt.verbosity:
      if spell_prog == 'aspell':
        print('No language was given. Will open aspell with locale-settings language.\n')
      if spell_prog == 'ispell':
        print('No language was given. Will open ispell with default language.\n')
    command = '%s -a' % spell_prog
  else:
    # user chosen language
    command = '%s -a %s %s' % (spell_prog, lang_opt, lang)
  p = Popen3(command, True)  # True: also capture the child's stderr

  banner = p.fromchild.readline()  # first line gives information about the program
  if banner == '':
    # the checker did not start; nothing below can work without it
    sys.stderr.write(''.join(p.childerr.readlines()))
    reap(p)
    sys.exit(1)

  corrected = []
  try:
    for position, word in enumerate(words):
      sign = ""
      if word.endswith(trim_signs):
        sign = word[-1:]
        word = word[:-1]
      if correct_this(word):
        p.tochild.write('%s\n' % word.encode('utf-8'))
        p.tochild.flush()
        out = p.fromchild.readline()
        while out == '\n':  # skip the blank line that ends the previous answer
          out = p.fromchild.readline()
        # '*' means correct, '&' means misspelled with suggestions; everything
        # else (including '' at end of file) leaves the word unchanged
        if out[:1] == '&':
          fields = out.split(" ")
          if len(fields) > 4:
            # fields[4] is the first suggestion followed by ',' or newline
            suggestion = fields[4][:-1]
            distance = edit_distance(word, suggestion)
            if distance <= opt.distance:
              word = suggestion.decode('utf-8')
            elif opt.verbosity:
              print('%d. word: \'%s\' was not corrected to \'%s\'. '
                    'Edit_distance: %i is larger than distance: %i.\n'
                    % (position + 1, word, suggestion, distance, opt.distance))
      corrected.append(word + sign)
  finally:
    reap(p)
  return " ".join(corrected)
	
    
def correct_this(word):
  """Tell whether *word* is a candidate for dictionary correction.

  Returns False for words containing a hyphen, a square bracket or a digit
  and for words that are equal to their own upper case form (this includes
  the empty string); returns True for everything else.
  """
  excluded = "-[]"
  if any(sign in excluded or sign.isdigit() for sign in word):
    return False
  return word != word.upper()

def line_to_hocr(line, nr):
  """Return the hOCR markup for one text line as a string ending in a newline.

  line -- object with the attributes used here: 'bbox' (with corners 'ul' and
          'lr'), 'words' (sequences whose first element has the corners
          'ul' and 'lr') and 'text' (the words separated by single blanks)
  nr   -- number of the line; used for the ids of the line and its words

  The result is one 'ocr_line' span that contains an 'ocrx_word' span for
  every entry of line.words.
  """
  def escape(text):
    # the text ends up inside HTML; '&' must be replaced before '<' and '>'
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

  def bbox_title(box):
    return 'title="bbox %s %s %s %s">' % (box.ul.x, box.ul.y, box.lr.x, box.lr.y)

  tokens = line.text.split(" ")
  parts = ["     <span class='ocr_line' id='line_%s' " % nr, bbox_title(line.bbox)]
  for index, word in enumerate(line.words):
    # a word without a matching text token is written as an empty span
    token = tokens[index] if index < len(tokens) else ""
    # the line number is part of the id to keep word ids unique on the page
    parts.append("<span class='ocrx_word' id='word_%s_%s' " % (nr, index))
    parts.append(bbox_title(word[0]))
    parts.append(escape(token))
    parts.append(" </span>")
  parts.append("<br></span>\n")
  return "".join(parts)
	

class Options():
  """Holds every command line setting of the script with its default value."""

  def __init__(self):
    # switches; all of them are off unless requested
    for switch in ("help", "deskew", "ccsfilter", "auto_group",
                   "dict_correct", "hocr_out"):
      setattr(self, switch, False)

    # file and directory names as well as other text settings
    for text_setting in ("hocr_in", "outputfile", "outputdirectory",
                         "trainfile", "lang", "extra_chars_csvfile"):
      setattr(self, text_setting, "")
    self.heuristic_rules = "roman"

    # numeric settings
    self.verbosity = 0
    self.median_size = 0
    self.speckle_size = 0
    self.k = 1
    self.distance = 2

#
# here starts the main program
#
opt = Options()
args = sys.argv[1:]
imagefiles = []
extra_chars_dict = {}


if len(args) == 0:
  usage(1)


def _abort(message):
  """Write 'Error: <message>' to stderr and leave with exit status 1."""
  sys.stderr.write("Error: " + message + "\n")
  sys.exit(1)


def _unchanged(value):
  """Converter for options whose value is used as given."""
  return value


def _lowercase(value):
  """Converter for options whose value is compared case-insensitively."""
  return value.lower()


# options without second parameter: spelling -> attribute of opt set to True
_SWITCHES = {
  "-d": "deskew", "--deskew": "deskew",
  "-f": "ccsfilter", "--filter": "ccsfilter",
  "-a": "auto_group", "--automatic_group": "auto_group",
  "-D": "dict_correct", "--dictionary_correction": "dict_correct",
  "-ho": "hocr_out", "--hocr_out": "hocr_out",
}

# options with second parameter:
# (short form, long prefixes, attribute of opt, converter for the value)
_VALUE_OPTIONS = (
  ("-hi", ("--hocr_in=",), "hocr_in", _unchanged),
  ("-v", ("--verbosity=",), "verbosity", int),
  ("-o", ("--output=",), "outputfile", _unchanged),
  ("-od", ("--output_directory=",), "outputdirectory", _unchanged),
  ("-x", ("--xmlfile=",), "trainfile", _unchanged),
  ("-k", ("--k=",), "k", int),
  # the help text announces --median_filter; --median_size is the spelling
  # that has always been parsed, so both are accepted
  ("-mf", ("--median_filter=", "--median_size="), "median_size", int),
  ("-ds", ("--despeckle=",), "speckle_size", int),
  ("-L", ("--dictionary_language=",), "lang", _unchanged),
  ("-e", ("--edit_distance=",), "distance", int),
  ("-c", ("--extra_chars_csvfile=",), "extra_chars_csvfile", _unchanged),
  ("-R", ("--heuristic_rules=",), "heuristic_rules", _lowercase),
)

i = 0
while i < len(args):
  arg = args[i]
  i += 1
  if arg in ("-h", "--help"):
    usage(0)
  if arg == "--version":
    print(VERSION)
    sys.exit(0)
  if arg in _SWITCHES:
    setattr(opt, _SWITCHES[arg], True)
    continue

  handled = False
  for short_form, long_prefixes, attribute, convert in _VALUE_OPTIONS:
    raw_value = None
    option_name = short_form
    if arg == short_form:
      # the value is the next argument
      if i >= len(args):
        _abort("option %s requires an argument" % arg)
      raw_value = args[i]
      i += 1
    else:
      for prefix in long_prefixes:
        if arg.startswith(prefix):
          option_name = prefix[:-1]
          raw_value = arg[len(prefix):]
          break
    if raw_value is None:
      continue
    try:
      setattr(opt, attribute, convert(raw_value))
    except ValueError:
      _abort("invalid value '%s' for option %s" % (raw_value, option_name))
    handled = True
    break
  if handled:
    continue

  # unknown option
  if arg.startswith("-"):
    print("Error: option %s does not exist" % arg)
    usage(1)
  # everything else is assumed to be an image file
  imagefiles.append(arg)

# some plausibility checks
if opt.trainfile == "":
  _abort("no training data given")

if len(imagefiles) == 0:
  _abort("no image file given")

if len(imagefiles) > 1 and opt.outputdirectory == "":
  _abort("for multiple image files option -od (--output_directory) must be given")

if opt.outputdirectory != "" and not os.path.isdir(opt.outputdirectory):
  _abort("output directory '" + opt.outputdirectory + "' is not a proper directory")

for imagefile in imagefiles:
  if not os.path.exists(imagefile):
    _abort("image file '" + imagefile + "' not found")

if opt.hocr_in != "" and opt.outputdirectory != "":
  sys.stderr.write("hocr-input doesn't works with -od option\n")
  sys.exit(1)

if opt.hocr_out and opt.outputdirectory == "" and opt.outputfile == "":
  sys.stderr.write("hocr-output does only works with an output option\n")
  sys.exit(1)

# we import Gamera after parsing the command line arguments so that
# in case of a command line error the script can be aborted beforehand
from gamera.core import *
init_gamera()    
from gamera import knn   
from gamera.plugins import pagesegmentation
from gamera.plugins.pagesegmentation import textline_reading_order
from gamera.classify import ShapedGroupingFunction
from gamera.plugins.image_utilities import union_images
from gamera.toolkits.ocr.ocr_toolkit import *
from gamera.toolkits.ocr.classes import Textline,ClassifyCCs,Page,hocrPage



# the classifier and its training data are set up once and shared by all images
_feature_names = [
    "aspect_ratio",
    "fourier_broken",
    "moments",
    "volume64regions",
    "nholes_extended",
]
cknn = knn.kNNInteractive([], _feature_names, 0)

# a value of k below one is ignored, i.e. num_k is not touched in that case
if opt.k >= 1:
    cknn.num_k = opt.k
cknn.from_xml_filename(opt.trainfile)

# The additional classname -> character conversions do not depend on the
# image, so the file is read once before the loop over the images.
if opt.extra_chars_csvfile != "":
    f = codecs.open(opt.extra_chars_csvfile, "r", encoding='utf-8')
    try:
        for line in f:
            fields = line.split(',', 2)
            if len(fields) < 2:
                continue  # blank line or line without a comma
            extra_chars_dict[fields[0].strip()] = fields[1].strip("\n\r")
    finally:
        f.close()

# loop over all input images
for imagefile in imagefiles:

    if opt.verbosity > 0:
        print("processing file '" + imagefile + "' ...")

    img = load_image(imagefile)
    if img.data.pixel_type != ONEBIT:
        img = img.to_onebit()

    if opt.outputdirectory != "":
        opt.outputfile = os.path.join(opt.outputdirectory, os.path.basename(imagefile) + ".txt")

    if opt.median_size > 0:
        # rank (n*n+1)/2 in an n x n window
        img = img.rank((opt.median_size * opt.median_size + 1) // 2, opt.median_size)

    if opt.speckle_size > 0:
        img.despeckle(opt.speckle_size)

    if opt.ccsfilter:
        all_ccs = img.cc_analysis()
        # progress output is tied to the verbosity level, so that the
        # recognized text on stdout is not mixed with messages
        if opt.verbosity > 0:
            print("filter started on %d elements..." % len(all_ccs))
        median_black_area = median([cc.black_area()[0] for cc in all_ccs])
        # components more than ten times larger or smaller than the median
        # are erased from the image, all others are kept exactly once
        ccs = []
        for cc in all_ccs:
            area = cc.black_area()[0]
            if area > (median_black_area * 10) or area < (median_black_area / 10):
                cc.fill_white()
            else:
                ccs.append(cc)
        if opt.verbosity > 0:
            print("filter done: %d of %d CCs deleted." % (len(all_ccs) - len(ccs), len(all_ccs)))

    if opt.deskew:
        if opt.verbosity > 0:
            print("\ntry to skew correct...")
        rotation = img.rotation_angle_projections(-10, 10)[0]
        img = img.rotate(rotation, 0)
        if opt.verbosity > 0:
            print("rotated with %s angle" % rotation)

    if opt.auto_group:
        if opt.ccsfilter:
            the_ccs = ccs
        else:
            the_ccs = img.cc_analysis()
        median_cc = int(median([cc.nrows for cc in the_ccs]))
        autogroup = ClassifyCCs(cknn)
        autogroup.parts_to_group = 3
        # grouping distance: an eighth of the median glyph height, at least 2
        autogroup.grouping_distance = max([2, median_cc // 8])
        if opt.hocr_in == "":
            p = Page(img, classify_ccs=autogroup)
        else:
            p = hocrPage(img, opt.hocr_in, classify_ccs=autogroup)
        img.reset_onebit_image()
        if opt.verbosity > 0:
            print("autogrouping glyphs activated.")
            print("maximal autogroup distance: %s" % autogroup.grouping_distance)
    else:
        if opt.hocr_in == "":
            p = Page(img)
        else:
            p = hocrPage(img, opt.hocr_in)

    if opt.verbosity > 0:
        print("start page segmentation...")
        t = time.time()

    p.segment()

    if opt.verbosity > 0:
        t = time.time() - t
        print("\t segmentation done [ %s sec]" % t)

    if opt.verbosity > 1:
        # note that the file names are fixed, i.e. each image overwrites
        # the debug images of the previous one
        for rgbfilename, render in (("debug_lines.png", p.show_lines),
                                    ("debug_chars.png", p.show_glyphs),
                                    ("debug_words.png", p.show_words)):
            rgb = render()
            rgb.save_PNG(rgbfilename)
            print("file '%s' written" % rgbfilename)

    if opt.outputfile == "":
        # the recognized text is unicode; stdout gets an explicit encoding
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

    if opt.hocr_out:
        opt.outputfile += ".html"

    # The output file is opened once per image.  It is opened for appending,
    # so that the content of an already existing file is kept.
    outfile = None
    if opt.outputfile != "":
        outfile = codecs.open(opt.outputfile, "a", "utf-8")
    try:
        if opt.hocr_out:
            start_text = '''<html>
 <head>
  <meta charset="utf-8"/>
  <title></title>
  <meta />
 </head>
 <body>
  <div class='ocr_page' id='page_1' title='image "''' + imagefile + '"; bbox ' + str(img.ul.x) + " " + str(img.ul.y) + " " + str(img.lr.x) + " " + str(img.lr.y) + """; ppageno 0'>
"""
            outfile.write(start_text)

        for line_nr, line in enumerate(p.textlines):
            if opt.ccsfilter and len(line.glyphs) < 2:
                # a line with one or no glyph is useless
                continue

            cknn.classify_list_automatic(line.glyphs)

            if opt.ccsfilter:
                # lines with a median confidence lower than 0.005 should be useless too
                if median([glyph.get_confidence() for glyph in line.glyphs]) < 0.005:
                    continue

            line.sort_glyphs()
            line.text = textline_to_string(line, heuristic_rules=opt.heuristic_rules, extra_chars_dict=extra_chars_dict)
            if opt.dict_correct:
                line.text = correct(line.text, opt.lang)

            if outfile is None:
                print(line.text)
            elif opt.hocr_out:
                # line_nr is the position in p.textlines, also when lines were skipped
                outfile.write(line_to_hocr(line, line_nr))
            else:
                outfile.write(line.text + "\n")

        if opt.hocr_out:
            end_text = """  </div>
 </body>
</html>
"""
            outfile.write(end_text)
    finally:
        if outfile is not None:
            outfile.close()

    if opt.verbosity > 0 and opt.outputfile != "":
        print("text has been written to file %s" % opt.outputfile)
python-gamera.toolkits.ocr 1.2.2-3 / usr / bin / ocr4gamera