Progres de la journée
author Cyril Robert <crobert@inverse.ca>
Thu, 25 Feb 2010 22:18:50 +0000 (17:18 -0500)
committer Cyril Robert <crobert@inverse.ca>
Thu, 25 Feb 2010 22:18:50 +0000 (17:18 -0500)
Apps/auf_sep/.conf.py.swp [deleted file]
Apps/auf_sep/globals.py [new file with mode: 0644]
Apps/auf_sep/harvest/harvest.py
Apps/auf_sep/harvest/harvesters/pmb/__init__.py [new file with mode: 0644]
Apps/auf_sep/harvest/harvesters/pmb/export.py [new file with mode: 0644]
Apps/auf_sep/harvest/harvesters/pmb/pmbclient.py [new file with mode: 0644]
Apps/auf_sep/sep/.io.py.swp [deleted file]
Apps/auf_sep/setup.py
Apps/auf_sep/storage/.json.py.swp [deleted file]

diff --git a/Apps/auf_sep/.conf.py.swp b/Apps/auf_sep/.conf.py.swp
deleted file mode 100644 (file)
index b680080..0000000
Binary files a/Apps/auf_sep/.conf.py.swp and /dev/null differ
diff --git a/Apps/auf_sep/globals.py b/Apps/auf_sep/globals.py
new file mode 100644 (file)
index 0000000..90c128d
--- /dev/null
@@ -0,0 +1,28 @@
+# -*- encoding: utf-8 -*-
+
+
+#####
+# Meta fields
+#
+# String keys used to label the entries of a record's metadata dict.
+# Harvesters pull them in with "from globals import *" and use them as
+# dictionary keys, so the values below are part of the stored data format.
+# NOTE(review): the names resemble Dublin Core terms -- confirm before
+# relying on that correspondence.
+TITLE = 'title'
+CREATOR = 'creator'
+TOC = 'toc'
+ABSTRACT = 'abstract'
+PUBLISHER = 'publisher'
+CONTRIBUTOR = 'contributor'
+# Date-related keys.
+DATE_CREATION = 'creation'
+DATE_VALID = 'valid'
+DATE_AVAILABLE = 'available'
+DATE_ISSUED = 'issued'
+DATE_MODIFIED = 'modified'
+DATE_ACCEPTED = 'accepted'
+DATE_COPYRIGHTED = 'copyrighted'
+DATE_SUBMITTED = 'submitted'
+# Keys describing the nature and location of the resource.
+TYPE = 'type'
+FORMAT = 'format'
+MEDIUM = 'medium'
+IDENTIFIER = 'identifier'
+URI = 'uri'
+SOURCE = 'source'
+LANGUAGE = 'language'
+COVERAGE = 'coverage'
+
diff --git a/Apps/auf_sep/harvest/harvest.py b/Apps/auf_sep/harvest/harvest.py
index e69de29..0fa37d8 100644 (file)
@@ -0,0 +1,12 @@
+import sys
+
+sys.path.append ("../")
+from conf import RESOURCES
+
+
+# Run the harvester configured for every resource declared in conf.py.
+for name, options in RESOURCES.items ():
+    # The module path is built from the resource's 'type' and 'acces'
+    # settings, e.g. harvesters.<type>.<acces>.
+    module = 'harvesters.%s.%s' % (options['type'], options['acces'])
+    # __import__ returns the top-level package for a dotted name, so the
+    # leaf module is fetched from sys.modules once it has been loaded.
+    __import__ (module)
+    harvester = sys.modules[module]
+    nodes = harvester.harvest (options)
diff --git a/Apps/auf_sep/harvest/harvesters/pmb/__init__.py b/Apps/auf_sep/harvest/harvesters/pmb/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/Apps/auf_sep/harvest/harvesters/pmb/export.py b/Apps/auf_sep/harvest/harvesters/pmb/export.py
new file mode 100644 (file)
index 0000000..d60940d
--- /dev/null
@@ -0,0 +1,71 @@
+# -*- encoding: utf-8 -*-
+from pmbclient import PmbClient
+from lxml import etree
+import sys, re
+
+sys.path.append ("../../../")
+from globals import *
+
+def print_structure(element, tab=""):
+    """Print an element tree, one line per element, indented by depth.
+
+    Each line shows the tag, the whitespace-stripped text and tail, and the
+    attribute mapping of one element.  The tree itself is left untouched.
+
+    element -- the element to print; its descendants are printed as well
+    tab     -- indentation prefix, extended by one space per nesting level
+    """
+    # Strip into locals instead of writing back to the element, so that a
+    # debugging aid never alters the document it is inspecting.
+    text = element.text
+    if text:
+        text = text.strip ()
+    tail = element.tail
+    if tail:
+        tail = tail.strip ()
+    print(u"%s<%s>: %s, %s, %s" % (tab, element.tag, text, tail,
+        element.attrib))
+    # Iterating an element yields its children; getchildren() is deprecated.
+    for child in element:
+        print_structure(child, tab+" ")
+
+
+def findtext (node, tag):
+    """Return the text of the first descendant of node whose tag is tag.
+
+    Returns an empty string when there is no such descendant or when it has
+    no text, so the result can always be formatted as a string.
+
+    node -- element whose descendants are searched
+    tag  -- tag name to look for
+    """
+    found = node.find (".//%s" % tag)
+    if found is None:
+        return ""
+    # An empty element has text None; normalise it so that the literal
+    # "None" never ends up in formatted metadata.
+    return found.text or ""
+
+
+def harvest (options):
+    """Export the notices of a PMB server and map them to metadata dicts.
+
+    options -- mapping read for the keys 'host', 'username', 'password',
+               'db', 'login_script' and 'export_script'
+
+    Returns a list holding one dict per <notice> element of the export,
+    keyed by the constants imported from the globals module.
+    """
+    client = PmbClient ()
+    client.connect (options['host'])
+
+    credentials = {'user': options['username'],
+            'password': options['password'],
+            'database': options['db']}
+    client.login (credentials, options['login_script'])
+
+    # NOTE(review): the meaning of these fixed export parameters is defined
+    # by PMB and is not visible here -- confirm before changing them.
+    export_params = {'export_type': '14', 'lender': 'x'}
+    content = client.export (export_params, options['export_script'])
+
+    root = etree.XML (content)
+    nodes = []
+    for notice in root.findall (".//notice"):
+        meta = {}
+        # Children whose tag is not listed below are ignored.
+        for child in notice:
+            tag = child.tag
+            if tag == 'idNotice':
+                meta[IDENTIFIER] = child.text
+            elif tag == 'zoneTitre':
+                meta[TITLE] = findtext (child, 'titrePrincipal')
+            elif tag == 'zoneAuteurPrincipal':
+                meta[CREATOR] = u"%s %s" % (findtext(child, "prenom"),
+                        findtext(child, "nom"))
+            elif tag == 'zoneAuteursAutres':
+                meta[CONTRIBUTOR] = u"%s %s" % (findtext(child, "prenom"),
+                        findtext(child, "nom"))
+            elif tag == 'zoneEditeur':
+                meta[PUBLISHER] = u"%s, %s (%s)" % (findtext(child, "nom"),
+                        findtext(child, "ville"), findtext(child, "annee"))
+            elif tag == 'zoneCategories':
+                meta[COVERAGE] = findtext (child, "categorie")
+
+        # The link is stored under URI, the constant globals.py provides.
+        # A notice without an identifier gets no link rather than raising
+        # KeyError.
+        if IDENTIFIER in meta:
+            meta[URI] = "http://%s/%s/catalog.php?id=%s" % \
+                    (options['host'], 'pmb', meta[IDENTIFIER])
+        nodes.append (meta)
+    return nodes
+
+
diff --git a/Apps/auf_sep/harvest/harvesters/pmb/pmbclient.py b/Apps/auf_sep/harvest/harvesters/pmb/pmbclient.py
new file mode 100644 (file)
index 0000000..05c1d63
--- /dev/null
@@ -0,0 +1,122 @@
+import httplib, urllib, Cookie, time
+from exceptions import Exception
+from lxml import etree
+
+
+class PmbClient:
+    """Small HTTP client that logs into PMB and walks its export pages.
+
+    Intended call order: connect(), then login(), then export().
+    """
+    # httplib.HTTPConnection in use, or None until connect() is called.
+    handle = None
+    # Value sent in the "Cookie" request header; filled in by login().
+    cookies = ""
+
+    def __init__ (self):
+        pass
+
+    def __del__ (self):
+        # Close the connection, if any, when the client goes away.
+        if self.handle:
+            self.handle.close ()
+
+    def connect (self, server, port=80):
+        """Create the HTTP connection to server:port.
+
+        Any connection held from an earlier call is closed first.
+        """
+        if self.handle:
+            self.handle.close ()
+
+        self.handle = httplib.HTTPConnection (server, port)
+
+    def login (self, params, script):
+        """POST params to script and remember the cookies the server sets.
+
+        params -- mapping of form fields, sent url-encoded
+        script -- path of the login script on the server
+
+        Raises Exception when the response status is not 200 or when the
+        response carries no Set-Cookie header.
+        """
+        params = urllib.urlencode(params)
+        headers = {"Content-type": "application/x-www-form-urlencoded",
+                "Accept": "text/plain"}
+
+        self.handle.request ("POST", script, params, headers)
+        r = self.handle.getresponse ()
+
+        if r.status != 200:
+            raise Exception ("Login failed: %s %s" % (r.status, r.reason))
+        # Read the body so the connection can carry the next request.
+        r.read ()
+
+        raw = r.getheader ("set-cookie")
+        if raw is None:
+            # BaseCookie.load() cannot cope with None; fail with a clear
+            # message instead of an AttributeError.
+            raise Exception ("Login failed: no cookie returned by server")
+
+        bc = Cookie.BaseCookie ()
+        bc.load (raw)
+        # output() renders one "Cookie: ..." line per cookie; drop the
+        # header name and join the remainder into a single header value.
+        # NOTE(review): commas are removed, presumably because several
+        # Set-Cookie headers arrive comma-joined -- confirm.
+        tmp = bc.output (header="Cookie:").replace (",", "").split("\n")
+        cookies = [line.replace ("Cookie: ", "").strip () for line in tmp]
+        self.cookies = "; ".join (cookies)
+
+    def get_response (self):
+        """Return the response to the pending request, retrying until one
+        is obtained.
+
+        Errors raised by getresponse() are retried; KeyboardInterrupt and
+        SystemExit are not caught and propagate to the caller.
+        """
+        r = None
+        while r is None:
+            try:
+                r = self.handle.getresponse ()
+            except Exception:
+                # Not a bare except: the loop must stay interruptible.
+                r = None
+            # NOTE(review): the pause also runs after a successful read, as
+            # it always did; kept in case PMB relies on the delay -- confirm.
+            time.sleep (1)
+        return r
+
+    def find_next_location (self, buffer):
+        """Return the target of the last "document.location='...'" found in
+        the last <script> element of an HTML page.
+
+        buffer -- HTML source of the page
+
+        Returns "" when the page has no script, the script is empty, or no
+        such assignment is found.
+        """
+        root = etree.HTML (buffer)
+        if root is None:
+            return ""
+        scripts = root.findall (".//script")
+        if not scripts:
+            return ""
+        code = scripts[-1].text
+        if not code:
+            return ""
+
+        match = "document.location='"
+        i = code.rfind (match)
+        if i < 0:
+            return ""
+        rest = code[i + len (match):]
+        # The target ends at the closing single quote that is immediately
+        # followed by a double quote.
+        j = rest.find ("'\"")
+        if j < 0:
+            return ""
+        return rest[:j]
+
+    def make_url (self, old, script):
+        """Return old with its last "/"-separated component replaced by
+        script."""
+        parts = old.split ("/")
+        parts[-1] = script
+        return "/".join (parts)
+
+    def read_form (self, buffer):
+        """Return (params, action) for the first <form> of an HTML page.
+
+        buffer -- HTML source of the page
+
+        params maps name to value for every <input> of the form that has
+        both attributes; action is the form's "action" attribute.
+
+        Raises Exception when the page contains no form.
+        """
+        root = etree.HTML(buffer)
+        form = None
+        if root is not None:
+            form = root.find (".//form")
+        if form is None:
+            raise Exception ("Export failed: no form found in page")
+        script = form.attrib['action']
+
+        params = {}
+        for field in form.findall (".//input"):
+            name = field.get ("name")
+            value = field.get ("value")
+            # Inputs lacking a name or a value are skipped.
+            if name is not None and value is not None:
+                params[name] = value
+
+        return (params, script)
+
+    def export (self, params, script):
+        """Run the multi-step PMB export and return its content as UTF-8.
+
+        params -- mapping of form fields for the initial POST
+        script -- path of the export script on the server
+
+        The initial POST is followed by two script-driven redirects and the
+        submission of the form found on the page they lead to.
+        """
+        params = urllib.urlencode(params)
+        headers = {"Content-type": "application/x-www-form-urlencoded",
+                "Accept": "text/plain",
+                "Cookie": self.cookies}
+
+        self.handle.request ("POST", script, params, headers)
+        r = self.get_response ()
+        target = self.find_next_location (r.read ())
+        nextscript = self.make_url (script, target)
+
+        self.handle.request ("GET", nextscript, headers={"Cookie": self.cookies})
+        r = self.get_response ()
+        target = self.find_next_location (r.read ())
+        nextscript = self.make_url (script, target)
+
+        self.handle.request ("GET", nextscript, headers={"Cookie": self.cookies})
+        r = self.get_response ()
+        (params, target) = self.read_form (r.read ())
+
+        nextscript = self.make_url (script, target)
+        params = urllib.urlencode(params)
+        self.handle.request ("POST", nextscript, params, headers)
+        r = self.get_response ()
+        content = r.read ()
+
+        # PMB forces its output to ISO-8859-1, so re-encode it as UTF-8.
+        return content.decode('iso-8859-1').encode('utf-8')
diff --git a/Apps/auf_sep/sep/.io.py.swp b/Apps/auf_sep/sep/.io.py.swp
deleted file mode 100644 (file)
index 228a693..0000000
Binary files a/Apps/auf_sep/sep/.io.py.swp and /dev/null differ
diff --git a/Apps/auf_sep/setup.py b/Apps/auf_sep/setup.py
index a78276e..487d1e6 100644 (file)
@@ -18,5 +18,5 @@ setup(
         package_dir={'auf_sep': '.',},
         include_package_data=True,
         zip_safe=False,
-        install_requires=[],
+        install_requires=["lovely.jsonrpc",],
         )
diff --git a/Apps/auf_sep/storage/.json.py.swp b/Apps/auf_sep/storage/.json.py.swp
deleted file mode 100644 (file)
index 567a0ba..0000000
Binary files a/Apps/auf_sep/storage/.json.py.swp and /dev/null differ