Discipline et Thématiques classes utilisées à partir de savoir et datamaster
[auf_savoirs_en_partage_django.git] / auf_savoirs_en_partage / savoirs / lib / harvesters / oai / generic.py
CommitLineData
0cc5f772
CR
1# -*- encoding: utf-8 -*-
2from lxml import etree
3from urlparse import urlparse, urljoin
4import sys, urllib, httplib, re, chardet
5
ecc62927 6from auf_savoirs_en_partage.savoirs.globals import *
0cc5f772
CR
7from auf_savoirs_en_partage.savoirs.lib.utils \
8 import safe_append, print_structure, meta_set
9
10
# Dublin Core element name (namespace stripped) -> list of internal metadata
# fields that receive the element's text.  harvest() looks elements up here;
# an element name absent from this table is ignored.
# NOTE: the name shadows the builtin ``map``; it is kept unchanged because
# other modules may import it under this name.
map = {
    'title': [TITLE],
    'creator': [CREATOR],
    'contributor': [CONTRIBUTOR],
    'subject': [SUBJECT],
    'description': [DESCRIPTION],
    'publisher': [PUBLISHER],
    'date': [DATE_CREATION],
    'type': [TYPE],
    # An identifier feeds both the identifier field and the record URI.
    'identifier': [IDENTIFIER, URI],
    'format': [FORMAT],
    'relation': [SOURCE],
}
23
24
def connect(url):
    """Return an httplib connection object for a parsed URL.

    *url* must expose ``scheme``, ``hostname`` and ``port`` attributes (for
    instance the result of ``urlparse``).  The ``https`` scheme yields an
    ``HTTPSConnection`` defaulting to port 443; every other scheme yields an
    ``HTTPConnection`` defaulting to port 80.  An explicit ``url.port``
    always takes precedence over the default.
    """
    if url.scheme == 'https':
        factory, default_port = httplib.HTTPSConnection, 443
    else:
        factory, default_port = httplib.HTTPConnection, 80

    port = default_port if url.port is None else url.port
    return factory(url.hostname, port)
38
def find_location(url_str):
    """Locate the OAI endpoint of the repository rooted at *url_str*.

    Each well-known endpoint suffix is resolved against *url_str* and probed
    with a ``?verb=Identify`` GET request; the first candidate answering
    with HTTP status 200 is selected.

    Returns the parsed URL (``urlparse`` result) of the selected endpoint,
    or the parsed *url_str* itself when no candidate answers 200.
    """
    base = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php")
    for test in possible:
        # Resolve the candidate once and probe exactly the URL that will be
        # returned.  Concatenating base.path + test would probe a different
        # (or slash-less) path whenever the base path does not end in "/".
        candidate = urlparse(urljoin(base.geturl(), test))
        handle = connect(candidate)
        try:
            handle.request("GET", candidate.path + "?verb=Identify")
            status = handle.getresponse().status
        finally:
            # One connection is opened per candidate; always release it.
            handle.close()
        if status == 200:
            return candidate

    return base
53
def load_xml(url):
    """Download *url* and parse the response into an lxml element.

    The payload is decoded using the encoding guessed by ``chardet`` (UTF-8
    when no guess is available), then normalised before parsing:

    * the wrong OAI namespace URI emitted by Greenstone is corrected;
    * every element name is lower-cased, so callers must look elements up
      by their lower-case name.

    If the normalised document does not parse, one more attempt is made
    with bare ``&`` characters (those not starting an entity reference)
    escaped as ``&amp;``.

    Returns the root element.  Raises ``etree.XMLSyntaxError`` when the
    document cannot be parsed even after the ampersand repair.
    """
    ud = urllib.urlopen(url)
    try:
        original = ud.read()
    finally:
        # Release the handle even when the read fails.
        ud.close()

    # chardet reports None when it cannot guess (e.g. empty body).
    encoding = chardet.detect(original)['encoding'] or "utf-8"
    content = original.decode(encoding)

    # Greenstone advertises a wrong namespace URI for OAI 2.0.
    content = content.replace("\"http://www.openarchives.com/OAI/2.0\"",
                              "\"http://www.openarchives.org/OAI/2.0/\"")

    # Lower-case opening and closing tag names (attributes are untouched).
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I|re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    try:
        return etree.XML(content.encode("utf-8"))
    except etree.XMLSyntaxError:
        # Some servers emit unescaped ampersands.  Repair only after a
        # failed parse so well-formed documents (including CDATA content)
        # are never altered, and never double-escape existing entities.
        repaired = re.sub(r"&(?!#?\w+;)", "&amp;", content)
        if repaired == content:
            raise
        return etree.XML(repaired.encode("utf-8"))
73
74
def harvest(options):
    """Generic harvesting routine for any system able to export its data
    in `OAI <http://www.openarchives.org/>`_ format.

    *options*
        Dictionary; the only key read by this function is:

        *url*
            Base URL of the remote repository.  It is handed to
            ``find_location`` to discover the actual OAI endpoint.

    All pages of the ``ListRecords`` response are fetched by following
    resumption tokens.

    Returns a list of metadata dictionaries, one per harvested record.
    Records that end up without a ``uri`` entry (after falling back on
    their ``source`` entry) are dropped.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    endpoint = find_location(options['url']).geturl()

    def next_token(root):
        # load_xml lower-cases element names, so the token element normally
        # appears as "resumptiontoken"; the original spelling is tried too
        # in case the document was not normalised.
        for name in ("resumptiontoken", "resumptionToken"):
            token = root.find(".//%s%s" % (oai2ns, name))
            if token is not None:
                # The last page carries an empty token element.
                return (token.text or "").strip()
        return ""

    records = []
    seen_tokens = set()
    root = load_xml(endpoint + "?verb=ListRecords&metadataPrefix=oai_dc")
    while True:
        records.extend(root.findall(".//%srecord" % oai2ns))
        token = next_token(root)
        # Stop on the final (empty/absent) token, and also when a server
        # hands back a token it already issued, which would loop forever.
        if not token or token in seen_tokens:
            break
        seen_tokens.add(token)
        # Tokens may contain characters that are reserved in URLs.
        quoted = urllib.quote(token.encode("utf-8"))
        root = load_xml(endpoint
                        + "?verb=ListRecords&resumptionToken=%s" % quoted)

    nodes = []
    for record in records:
        meta = {}
        stamp = record.find(".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
        if stamp is not None:
            meta[DATE_MODIFIED] = stamp.text

        dcnode = record.find(".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode:
                text = (c.text or "").strip()
                if not text:
                    continue

                name = c.tag.replace(metans, "")
                # Only identifiers that look like URLs are kept.
                if name == "identifier" and not text.startswith("http"):
                    continue

                for field in map.get(name, []):
                    meta_set(meta, field, text)

        # Fall back on the relation/source value when no URL identifier
        # was provided.
        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        if meta.get("uri") is not None:
            nodes.append(meta)

    return nodes
132
133 #print meta, etree.tostring(record, pretty_print = True)
134 if meta.get("uri") is None and meta.get("source") is not None:
135 meta['uri'] = meta['source']
136
137 if meta.get("uri") is not None:
138 nodes.append (meta)
139
140 return nodes