Commit 8c22da53 authored by Mattia Bondanza's avatar Mattia Bondanza

Added a very basic ISBN search over the text of each PDF. The ISBNPLUS API is not implemented yet.

parent 60f287e5
......@@ -3,6 +3,7 @@ import os
import magic
from fnmatch import fnmatch
from pyPdf import PdfFileReader
from searchISBN import *
root = '/home/mattia/Nextcloud/Università/Libreria/'
pattern = "*"
......@@ -11,6 +12,8 @@ type_stat = {}
total_files = 0.0
metadata = {}
books = []
for path, subdirs, files in os.walk(root):
for name in files:
if fnmatch(name, pattern):
......@@ -26,6 +29,9 @@ for path, subdirs, files in os.walk(root):
#file metadata analysis
if ftype == 'application/pdf':
print "Analyzing " + os.path.join(path, name) + " ..."
current_book = {}
current_book['path'] = os.path.join(path, name)
f = open( os.path.join(path, name), "rb" )
pdf_toread = PdfFileReader( f )
if pdf_toread.isEncrypted:
......@@ -36,6 +42,20 @@ for path, subdirs, files in os.walk(root):
metadata[mdt] += 1
else:
metadata[mdt] = 1.0
if '/Author' in pdf_info.keys():
current_book['author'] = pdf_info['/Author']
else :
current_book['author'] = None
if '/Title' in pdf_info.keys():
current_book['title'] = pdf_info['/Title']
else :
current_book['title'] = None
ISBNstrings = searchISBNstrings(os.path.join(path, name) )
current_book['isbn'] = ISBNstrings
books.append(current_book)
f.close()
print "*****FILE TYPE ANALYSIS*****"
......@@ -46,5 +66,34 @@ print "*****FILE METADATA ANALYSIS*****"
for mdt in metadata.keys():
if mdt == '/Author' or mdt == '/Title':
print "\x1b[31m",
print "* ", int( metadata[mdt] ) , "/", int( type_stat['application/pdf'] ), "(", round(metadata[mdt]/type_stat['application/pdf'] * 100, 1), "%) are tagged with", mdt,s
print "* ", int( metadata[mdt] ) , "/", int( type_stat['application/pdf'] ), "(", round(metadata[mdt]/type_stat['application/pdf'] * 100, 1), "%) are tagged with", mdt,
print "\x1b[0m"
print "********************************"
print
for book in books:
print book['path']
print "\tAuthor:", book['author']
print "\tTitle:", book['title']
print "\tFound ISBN: "
if book['isbn'] is not None:
for isbn in book['isbn']:
#This function is evil. It shouldn't be used.
mdt = evilMetadataFromISBN( isbn )
if mdt['author'] is not '' and mdt['title'] is not '':
print "\x1b[34m",
else:
print "\x1b[31m",
print "\t\t",isbn,
print "\x1b[0m",
if mdt['author'] is not '' and mdt['title'] is not '':
print "( A:\x1b[33m", mdt['author'], "\x1b[0mT:\x1b[32m", mdt['title'], "\x1b[0m)",
#Idee per il consistecy check del libro :
# verifica che il numero di pagine corrisponda ( +- 5% )
# cerca il titolo del libro nel libro stesso e guarda quante ricorrenze ci sono
# cerca il nome dell'autore nel libro e guarda quante ricorrenze ci sono
# se ci sono più ISBN verifica che non siano le versioni 10 e 13 dello stesso libro.
print
import commands
import cStringIO
import json
import os
import re
import subprocess
import sys

import pycurl
import PyPDF2
# ISBNPlus API credentials. Environment variables of the same name take
# precedence so that real keys do not have to live in the source tree; the
# literals are kept only as backward-compatible fallbacks.
# NOTE(review): these values are committed to version control and should be
# treated as public.
ISBNPLUS_APP_ID = os.environ.get('ISBNPLUS_APP_ID', 'd6303441')
ISBNPLUS_APP_KEY = os.environ.get('ISBNPLUS_APP_KEY', '07b60ccae2f7f2bb0e8acbc1dbbeb540')
def getPDFContent(path, num_pages=10):
    """Return the whitespace-normalised text of the first pages of a PDF.

    Args:
        path: Value handed to ``PyPDF2.PdfFileReader`` to open the document.
        num_pages: Maximum number of leading pages to extract. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        The text of the extracted pages joined together, with non-breaking
        spaces replaced by plain spaces and every run of whitespace
        collapsed to a single space.
    """
    inpdf = PyPDF2.PdfFileReader(path)
    # Never request more pages than the document has: asking for a page
    # past the end fails, which made every PDF shorter than ``num_pages``
    # pages unreadable for the callers.
    last_page = min(num_pages, inpdf.getNumPages())
    pages = [inpdf.getPage(i).extractText() for i in range(last_page)]
    content = "\n".join(pages)
    return " ".join(content.replace(u"\xa0", " ").split())
def isValidISBNAnsware( metadata ):
    """Tell whether an ISBNPlus answer can be decoded as JSON.

    Args:
        metadata: Raw response body, expected to be a JSON string.

    Returns:
        True if ``json.loads`` accepts ``metadata``, False otherwise.
    """
    try:
        # Parse the argument itself. The previous code parsed an undefined
        # name, so the bare ``except`` swallowed the NameError and the
        # function returned False for every input.
        json.loads( metadata )
    except (TypeError, ValueError):
        return False
    return True
def _bibtexField( text, name ):
    """Return the contents of the first ``name={...}`` in text, or ''."""
    marker = name + "={"
    start = text.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    # ``find`` stops at the end of the text, so an unterminated field can
    # no longer run past the last character.
    end = text.find("}", start)
    if end == -1:
        return ''
    return text[start:end]


def _downloadISBNPlusPage( isbn ):
    """Return the body of http://isbnplus.com/<isbn> fetched with curl."""
    # An argument list (no shell) keeps the ISBN from being interpreted as
    # shell syntax. stderr is merged into stdout, as commands.getoutput did.
    try:
        proc = subprocess.Popen(
            ["curl", "-s", "http://isbnplus.com/" + isbn],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError:
        # curl could not be started; behave as if the page were empty.
        return ''
    raw = proc.communicate()[0]
    if not isinstance(raw, str):
        raw = raw.decode("utf-8", "replace")
    return raw


def evilMetadataFromISBN( isbn, page=None ):
    """Scrape author and title for an ISBN from its isbnplus.com page.

    This avoids the official API (which requires some time for activation):
    the page http://isbnplus.com/<isbn> is downloaded as a whole, simply
    assuming that it exists, and the values of the ``author={...}`` and
    ``title={...}`` BibTeX fields are cut out of the raw text. It is a
    stop-gap; use the API instead.

    Args:
        isbn: ISBN string appended to the site URL.
        page: Optional page text that was already downloaded. When given,
            nothing is fetched and the fields are extracted from it.

    Returns:
        A dict with the keys 'author' and 'title'. A value is '' when the
        corresponding field is not present in the page.
    """
    if page is None:
        page = _downloadISBNPlusPage( isbn )
    return {
        'author': _bibtexField( page, 'author' ),
        'title': _bibtexField( page, 'title' ),
    }
def metadataFromISBN( isbn ):
    """Query the ISBNPlus search API for an ISBN.

    Args:
        isbn: ISBN string used as the ``q`` parameter of the search.

    Returns:
        The decoded JSON answer, or None when the answer is not valid JSON.
        Errors raised by ``pycurl`` while performing the request propagate.
    """
    buf = cStringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, 'https://api-2445581351187.apicast.io/search?q='+isbn+'&app_id='+ISBNPLUS_APP_ID+'&app_key='+ISBNPLUS_APP_KEY)
        c.setopt(c.WRITEFUNCTION, buf.write)
        c.perform()
        answare = buf.getvalue()
    finally:
        # Release the handle and the buffer on every path, not only on
        # success.
        c.close()
        buf.close()
    # Decode here and return the result: the previous code called a
    # validator that does not exist and returned an undefined name.
    try:
        return json.loads( answare )
    except ValueError:
        return None
def metadataISBNGetAuthor( mdt ):
    """Return the author of the first entry of an ISBNPlus answer.

    Args:
        mdt: Decoded answer holding its entries under the 'list' key, or
            None (what metadataFromISBN returns for an invalid answer).

    Returns:
        The 'author' value of ``mdt['list'][0]``, or None when ``mdt`` is
        empty or None, has no entries, or the first entry has no 'author'.
    """
    # Guard against a missing answer or an empty result list instead of
    # failing on the subscript.
    entries = mdt.get('list') if mdt else None
    if not entries or 'author' not in entries[0]:
        return None
    return entries[0]['author']
def metadataISBNGetTitle( mdt ):
    """Return the title of the first entry of an ISBNPlus answer.

    Args:
        mdt: Decoded answer holding its entries under the 'list' key, or
            None (what metadataFromISBN returns for an invalid answer).

    Returns:
        The 'title' value of ``mdt['list'][0]``, or None when ``mdt`` is
        empty or None, has no entries, or the first entry has no 'title'.
    """
    # Guard against a missing answer or an empty result list instead of
    # failing on the subscript.
    entries = mdt.get('list') if mdt else None
    if not entries or 'title' not in entries[0]:
        return None
    return entries[0]['title']
def searchISBNstrings( path ):
    """Collect the ISBN candidates found in the text of a PDF.

    Every occurrence of the literal "ISBN" in the text returned by
    getPDFContent is inspected: the digits contained in the 23 characters
    starting at the marker form one candidate. A candidate that is already
    contained in a previously kept one is dropped.

    Args:
        path: Path of the PDF, passed on to getPDFContent.

    Returns:
        The list of digit strings in order of appearance, or None when the
        text could not be extracted or no candidate was found.
    """
    try:
        cnt = getPDFContent(path).encode("ascii", "ignore")
    except Exception:
        # Best effort: any extraction failure means "no ISBN", but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None
    out = []
    pos = cnt.find("ISBN")
    while pos != -1:
        strippedisbn = re.sub(r"\D", "", cnt[pos:pos+23])
        # Skip markers that are not followed by any digit; previously an
        # empty string was stored as a candidate when it came first.
        if strippedisbn and not any( strippedisbn in s for s in out ):
            out.append(strippedisbn)
        pos = cnt.find("ISBN", pos + 1)
    return out or None
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment