#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Thu Oct  8 10:39:48 2015

Tool that attempts to produce BibTex files for PDF.

@author: mints
"""
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import re
from ads import SearchQuery
from arxiv2bib import arxiv2bib

class PDFScanner(object):
    """Extracts bibliographic hints (link URI, title, arXiv id, DOI)
    from a PDF file using pdfminer."""

    def __init__(self, name):
        """Parse the PDF at path *name*.

        The file handle is deliberately kept open for the lifetime of
        the object because PDFParser reads from it lazily.
        """
        self.name = name
        # PDFs are binary: 'rb' avoids newline translation / decoding
        # corruption (the original opened in text mode 'r').
        self.parser = PDFParser(open(name, 'rb'))
        self.doc = PDFDocument(self.parser)

    def get_info(self):
        """Return pdfminer's raw document metadata structure."""
        return self.doc.info

    def get_uri(self):
        """Return the last path component of the document's single
        outgoing http(s) link target, or None.

        None is returned when there is not exactly one such link,
        since an ambiguous link cannot be trusted as a bibcode.
        """
        uri_count = 0
        uri = None
        for objid in self.doc.xrefs[0].offsets.keys():
            obj = self.doc.getobj(objid)
            # Link annotations carry an action dict under 'A' whose
            # 'URI' entry is the target address.
            if isinstance(obj, dict):
                action = obj.get('A')
                if isinstance(action, dict) and 'URI' in action:
                    if action['URI'].startswith('http'):
                        uri = action['URI'].split('/')[-1]
                        uri_count += 1
        if uri_count == 1:
            return uri
        return None

    def get_first_page(self):
        """Render the text of the first page into the file 'page.tmp'."""
        rsrcmgr = PDFResourceManager(caching=False)
        outfp = open('page.tmp', 'w')
        laparams = LAParams()
        laparams.all_texts = True
        laparams.detect_vertical = True
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # 'rb': the PDF byte stream must not be newline-translated.
            with open(self.name, 'rb') as fp:
                for page in PDFPage.get_pages(fp, [0], caching=False):
                    interpreter.process_page(page)
        finally:
            # Close even if page processing raises, so the converter
            # flushes and the temp file handle is released.
            device.close()
            outfp.close()

    def extract_search(self):
        """Collect search hints from metadata and the first page text.

        Returns (result, first_line): *result* may contain the keys
        'bibcode', 'title', 'arxiv' and/or 'identifier'; *first_line*
        is the first text line of page one (or None if the page was
        empty).
        """
        result = {}
        uri = self.get_uri()
        if uri is not None:
            result['bibcode'] = uri
        info = self.get_info()
        # NOTE(review): pdfminer's doc.info is typically a *list* of
        # dicts — confirm this membership test matches the installed
        # pdfminer version.  The original checked 'title' but read
        # 'Title'; use the same key for both.
        if 'Title' in info and len(info['Title']) != 0:
            # Only keep metadata titles that start with a year
            # (e.g. "2015..."), per the original heuristic.
            if re.match(r'^[12][0-9][0-9][0-9]\w*', info['Title']) is not None:
                result['title'] = info['Title']
        self.get_first_page()
        # Compile once and search once per line (the original ran the
        # same search twice per matching line).
        doi_re = re.compile(r'doi\:\s*(\S+)', flags=re.IGNORECASE)
        first_line = None
        with open('page.tmp', 'r') as tmp:
            icount = 0
            title = ''
            for line in tmp:
                if first_line is None:
                    first_line = line
                if len(line) > 2 and icount < 2:
                    # Assume the first two non-trivial lines form the title.
                    title = title + ' ' + line.strip()
                    icount += 1
                elif line.startswith('arXiv'):
                    result['arxiv'] = line.split()[0][6:]
                elif line.startswith('doi:'):
                    result['identifier'] = line.split()[0][4:]
                match = doi_re.search(line)
                if match is not None:
                    result['identifier'] = match.groups()[0]
            if len(title) > 0:
                result['title'] = title
        return result, first_line

if __name__ == '__main__':
    scan = PDFScanner(sys.argv[1])
    scan.get_uri()
    entry, first_line = scan.extract_search()
    is_ok = False
    #print entry, first_line
    if entry != {}:
        if 'arxiv' in entry:
            print arxiv2bib([entry['arxiv']])[0].bibtex()
            is_ok = True
        elif 'identifier' in entry:
            #import ipdb; ipdb.set_trace()
            query = SearchQuery(identifier=entry['identifier'])
            try:
                print query.next().bibtex
                is_ok = True
            except StopIteration:
                pass
        else:
            query = SearchQuery(**entry)
            try:
                print query.next().bibtex
                is_ok = True
            except StopIteration:
                pass
    if not is_ok:
        from pygoogle import pygoogle
        g = pygoogle('%s site:adsabs.harvard.edu' % first_line)
        g.pages = 1
        #print g.get_result_count()
        if g.get_result_count() > 0:
            result = g.get_urls()[0]
            query = SearchQuery(identifier=result.split('/')[-1])
            try:
                print query.next().bibtex
                is_ok = True
            except StopIteration:
                pass
        #else:
        #    g = pygoogle('%s site:iopscience.iop.org' % first_line)
        #    g.pages = 1
        #    import urllib2
        #    article_id = '/'.join(g.get_urls()[0].split('/')[3:])
        #    #print article_id
        #    url = 'http://iopscience.iop.org/export?articleId=%s&exportFormat=iopexport_bib&exportType=abs&navsubmit=Export%%2Babstract' % article_id
        #    #print url
        #    response = urllib2.urlopen(url)
        #    print response.read()
