Commit 87ced199 authored by Mattia Bondanza

Improved the handling of temporary directory. Now default is /tmp/BAO and you...

Improved the handling of the temporary directory. The default is now /tmp/BAO, and a different one can be set with the --tmp-directory option.
parent 885c24c7
......@@ -12,11 +12,15 @@ parser = argparse.ArgumentParser( description="BAO takes a directory that contai
"analyzes them, to tag the files with the correct metadata." )
parser.add_argument( 'directory', nargs = 1, type=str, help='The directory that contains the'
' pdf file to analyze', metavar='/directory/' )
' pdf file to analyze', metavar='DIRECTORY' )
parser.add_argument( '-p', '--pattern', nargs = 1,
dest='pattern', type=str, required=False, default='*', help='The pattern the file have to match to be'
' analyzed', metavar='path' )
dest='pattern', type=str, required=False, default=['*'], help='The pattern the file have to match to be'
' analyzed', metavar='PATTERN' )
parser.add_argument( '--tmp-directory', nargs = 1, dest='tmp_directory', type=str, required=False,
default=['/tmp/BAO'], help='The directory where all temporary files required by the script are'
' created.' , metavar='TMP_DIRECTORY' )
args = parser.parse_args()
......@@ -28,6 +32,9 @@ metadata = {}
books = []
if not os.path.exists( args.tmp_directory[0] ):
os.makedirs( args.tmp_directory[0] )
for path, subdirs, files in os.walk(args.directory[0]):
for name in files:
if fnmatch(name, args.pattern[0]):
......@@ -104,7 +111,7 @@ for path, subdirs, files in os.walk(args.directory[0]):
print "[ISBNsrch]\tFailed extracting text from metadata."
print "[ISBNsrch]\tTrying extracting text with slate."
try:
cnt = getTextWithSlate( os.path.join(path, name) )
cnt = getTextWithSlate( os.path.join(path, name), temporary_file_directory = args.tmp_directory[0] )
#Frequently slater returns strings with a lot of chr(12) for pdf with no text layer
#instead of ''.
if len( cnt.replace(chr(12), '' ) ) != 0:
......@@ -120,7 +127,7 @@ for path, subdirs, files in os.walk(args.directory[0]):
print "[ISBNsrch]\tNo text layer."
print "[ISBNsrch]\tExecuting OCR on first and last 10 pages..."
try:
cnt = getTextWithOCR( os.path.join(path, name) )
cnt = getTextWithOCR( os.path.join(path, name), temporary_file_directory = args.tmp_directory[0] )
if len( cnt ) != 0:
print "[ISBNsrch]\tText layer extracted."
except:
......@@ -145,6 +152,8 @@ for path, subdirs, files in os.walk(args.directory[0]):
#Close the file
f.close()
os.removedirs( args.tmp_directory[0] )
print "*****FILE TYPE ANALYSIS*****"
for types in type_stat.keys():
print "* ", int( type_stat[types] ), "/", int( total_files ), "(", round(type_stat[types] / total_files * 100, 1), "%) are ", types
......
......@@ -8,6 +8,8 @@ import xmltodict as xmlparser
import slate
from pypdfocr import pypdfocr as pypdfocr
from time import sleep
import isbnPLUS
import OpenLibrary
......@@ -23,24 +25,28 @@ def getPDFContent(path):
def getTextFromMetadata( path ):
    """Return the content of the PDF at *path* as an ASCII-only byte string.

    The text is obtained from getPDFContent() and then encoded to ASCII;
    characters that cannot be represented in ASCII are silently dropped.
    """
    pdf_text = getPDFContent(path)
    ascii_text = pdf_text.encode("ascii", "ignore")
    return ascii_text
def getTextWithSlate( path, temporary_file_directory = '/tmp' ):
    """Extract the text layer of a PDF using slate.

    A reduced copy of the PDF is written to ``stripped.pdf`` inside
    *temporary_file_directory* by stripFirstAndLast10Pages() (presumably
    keeping only the first and last 10 pages -- confirm against that helper),
    parsed with slate, and removed again.

    Parameters:
        path: path of the PDF file to read.
        temporary_file_directory: existing directory in which the
            intermediate ``stripped.pdf`` is created. Defaults to ``/tmp``.

    Returns:
        The concatenation of the text of every page slate yields.
    """
    tmp_file_path = os.path.join( temporary_file_directory, 'stripped.pdf' )
    try:
        stripFirstAndLast10Pages( path, tmp_file_path )
        # time.sleep() takes seconds: the former sleep(1000) stalled every
        # file for almost 17 minutes. A one second pause is kept on the
        # assumption that 1000 was meant as milliseconds -- TODO confirm
        # whether any pause is needed at all.
        sleep(1)
        # PDF is a binary format, so the file must not be opened in text mode.
        with open( tmp_file_path, "rb" ) as f:
            texts = slate.PDF(f)
        return "".join( texts )
    finally:
        # Always clean up, even on failure, so that the temporary directory
        # is left empty and can be removed afterwards.
        if os.path.exists( tmp_file_path ):
            os.remove( tmp_file_path )
def getTextWithOCR( path, temporary_file_directory = '/tmp' ):
    """Extract text from a PDF without a text layer by running OCR on it.

    A reduced copy of the PDF is written to ``stripped.pdf`` inside
    *temporary_file_directory* by stripFirstAndLast10Pages() (presumably
    keeping only the first and last 10 pages -- confirm against that helper).
    PyPDFOCR is run on that copy and the text is read back from
    ``stripped_ocr.pdf`` in the same directory. Both intermediate files are
    removed before returning.

    Parameters:
        path: path of the PDF file to read.
        temporary_file_directory: existing directory in which the
            intermediate files are created. Defaults to ``/tmp``.

    Returns:
        The text returned by getTextFromMetadata() for the OCR output file.
    """
    tmp_file_path = os.path.join( temporary_file_directory, 'stripped.pdf' )
    # Fixes a NameError: this path was previously built from the misspelled,
    # undefined name "emporary_file_directory".
    tmp_ocr_path = os.path.join( temporary_file_directory, 'stripped_ocr.pdf' )
    try:
        stripFirstAndLast10Pages( path, tmp_file_path )
        ocr = pypdfocr.PyPDFOCR()
        ocr.go( [tmp_file_path] )
        return getTextFromMetadata( tmp_ocr_path )
    finally:
        # Always clean up, even on failure, so that the temporary directory
        # is left empty and can be removed afterwards.
        for leftover in ( tmp_file_path, tmp_ocr_path ):
            if os.path.exists( leftover ):
                os.remove( leftover )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment