Mauvais argument pour strftime()
[auf_savoirs_en_partage_django.git] / auf_savoirs_en_partage / savoirs / lib / harvesters / oai / generic.py
CommitLineData
0cc5f772 1# -*- encoding: utf-8 -*-
d972b61d 2import sys
0cc5f772
CR
3from lxml import etree
4from urlparse import urlparse, urljoin
5import sys, urllib, httplib, re, chardet
6
d972b61d 7from auf_savoirs_en_partage.savoirs.models import ListSet
ecc62927 8from auf_savoirs_en_partage.savoirs.globals import *
0cc5f772
CR
9from auf_savoirs_en_partage.savoirs.lib.utils \
10 import safe_append, print_structure, meta_set
11
12
# Lookup table keyed by Dublin Core element name (namespace prefix removed).
# Each value lists the metadata fields that harvest() fills, through
# meta_set(), with the text of that element. An identifier feeds two fields.
# NOTE(review): the upper-case constants are presumably provided by the star
# import from savoirs.globals -- confirm.
# The name shadows the builtin ``map``; it is kept because harvest() looks the
# table up under this name.
map = {
    'title': [TITLE],
    'creator': [CREATOR],
    'contributor': [CONTRIBUTOR],
    'subject': [SUBJECT],
    'description': [DESCRIPTION],
    'publisher': [PUBLISHER],
    'date': [DATE_CREATION],
    'type': [TYPE],
    'identifier': [IDENTIFIER, URI],
    'format': [FORMAT],
    'relation': [SOURCE],
}
25
26
def connect(url):
    """Build an (unopened) HTTP or HTTPS connection object for *url*.

    *url* is a parsed URL exposing ``scheme``, ``hostname`` and ``port``
    (as returned by ``urlparse``).  An ``https`` scheme yields an
    ``HTTPSConnection`` defaulting to port 443; any other scheme yields an
    ``HTTPConnection`` defaulting to port 80.  An explicit port in the URL
    always takes precedence over the default.
    """
    if url.scheme == 'https':
        factory, fallback_port = httplib.HTTPSConnection, 443
    else:
        factory, fallback_port = httplib.HTTPConnection, 80

    port = fallback_port if url.port is None else url.port
    return factory(url.hostname, port)
40
def find_location(url_str):
    """Locate the OAI endpoint of the repository rooted at *url_str*.

    A handful of well-known endpoint paths are tried in turn with an
    ``Identify`` request; the first one answering with HTTP 200 wins.

    Returns the parsed URL (``urlparse`` result) of the endpoint found, or
    the parsed *url_str* itself when no candidate answers with 200.
    Network errors raised by the HTTP client propagate to the caller.
    """
    url = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php",
                "oai/oai2.php")
    for test in possible:
        # Probe exactly the URL that will be returned.  Concatenating
        # url.path and the candidate by hand diverges from urljoin() when the
        # base path lacks a trailing slash, and produces a non-absolute
        # request path when the base URL has no path at all.
        candidate = urlparse(urljoin(url.geturl(), test))
        handle = connect(candidate)
        try:
            handle.request("GET", candidate.path + "?verb=Identify")
            status = handle.getresponse().status
        finally:
            # One connection is opened per probe; always release it.
            handle.close()
        if status == 200:
            return candidate

    return url
55
def load_xml(url):
    """Download *url* and parse the response into an lxml element.

    The payload is decoded with the encoding guessed by chardet, patched for
    a few known server quirks, re-encoded as UTF-8 and handed to
    ``etree.XML``, whose result (the root element) is returned.

    Because tag names are lowercased here, callers must search for lowercase
    tag names (e.g. ``resumptiontoken``); attribute names are left untouched.
    """
    ud = urllib.urlopen(url)
    try:
        original = ud.read()
    finally:
        # Release the handle even when the read fails.
        ud.close()

    # chardet reports None when it cannot guess (e.g. an empty body); fall
    # back to UTF-8 rather than failing inside decode(None).
    encoding = chardet.detect(original)['encoding'] or "utf-8"
    content = original.decode(encoding)

    # Greenstone quirk: it advertises a wrong OAI namespace URI.
    content = content.replace("\"http://www.openarchives.com/OAI/2.0\"",
                              "\"http://www.openarchives.org/OAI/2.0/\"")

    # Lowercase every tag name so lookups do not depend on server casing.
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I|re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    # Other quirk: some servers emit bare ampersands.  Escape only those that
    # do not already start an entity or character reference, so valid
    # references such as "&amp;" or "&#233;" are not double-escaped.
    # (The previous str.replace() call discarded its result and did nothing.)
    # NOTE(review): ampersands inside CDATA sections are escaped as well.
    bare_amp = re.compile(
        r"&(?!(?:#[0-9]+|#x[0-9A-Fa-f]+|[A-Za-z_][\w.-]*);)")
    content = bare_amp.sub("&amp;", content)

    # NOTE(review): the bytes are always UTF-8 here; a document whose XML
    # declaration names another encoding may be misread by the parser --
    # confirm against real feeds.
    return etree.XML(content.encode("utf-8"))
d972b61d 75
def store_listsets(options):
    """Query the server for all of its sets and store them in the database.

    *options* is a dictionary; ``options['url']`` is used to locate the OAI
    endpoint and ``options['server']`` is saved on every stored set.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    location = find_location(options['url'])
    tree = load_xml(location.geturl() + "?verb=ListSets")

    # Read every set before touching the database, so that a malformed <set>
    # element fails before anything has been written.  The first child is
    # taken as the spec and the second as the name.
    entries = [(node[0].text, node[1].text, options['server'])
               for node in tree.findall(".//%sset" % oai2ns)]

    for spec, name, server in entries:
        ls, _created = ListSet.objects.get_or_create(spec=spec)
        ls.name = name
        ls.server = server
        ls.save()
0cc5f772
CR
91
92
def harvest(options):
    """Generic harvesting routine for a system able to export its data in
    `OAI <http://www.openarchives.org/>`_ format.

    *options*
        *options* is a dictionary; this code reads the following keys:

        *url*
            URL of the remote repository, used to locate its OAI endpoint.
        *server*
            Name of the remote server, stored with each set by
            ``store_listsets``.

    Returns a list of metadata dictionaries, one for each harvested record
    that ends up with a URI.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    # Fetch and store the server's sets first.
    store_listsets(options)

    url = find_location(options['url'])

    records = []
    root = load_xml(url.geturl() + "?verb=ListRecords&metadataPrefix=oai_dc")
    records.extend(root.findall(".//%srecord" % oai2ns))
    # Tag names are searched in lowercase throughout this function.
    token = root.find(".//%sresumptiontoken" % oai2ns)
    if token is not None:
        print("total du serveur %s " % token.get("completeListSize"))

    # OAI-PMH marks the last page of a list with an *empty* resumptionToken
    # element, so an element without text must end the loop instead of
    # triggering a request for "resumptionToken=None".
    while token is not None and token.text and token.text.strip():
        # Tokens are opaque and may contain reserved characters; they must be
        # URL-encoded before being placed in the query string.
        resumption = urllib.quote(token.text.strip().encode("utf-8"), safe="")
        root = load_xml(url.geturl()
                        + "?verb=ListRecords&resumptionToken=%s" % resumption)
        records.extend(root.findall(".//%srecord" % oai2ns))
        token = root.find(".//%sresumptiontoken" % oai2ns)

    nodes = []
    for record in records:
        meta = {}
        node = record.find(".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
        meta[DATE_MODIFIED] = node.text

        dcnode = record.find(".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode.getchildren():
                # Normalise the element text in place: stripped, never None.
                c.text = c.text.strip() if c.text else ""
                if not c.text:
                    continue

                name = c.tag.replace(metans, "")
                # Identifiers that are not URLs are ignored.
                if name == "identifier" and not c.text.startswith("http"):
                    continue

                for field in map.get(name, []):
                    meta_set(meta, field, c.text)

        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        # An identifier must be present.  When the record did not provide one
        # starting with http (e.g. aide-en-ligne supplies a title-like
        # identifier), fall back to the URI.
        if meta.get("identifier") is None:
            meta['identifier'] = meta.get('uri')

        # Collect the sets this record belongs to.
        listsets = record.findall(".//%sheader/%ssetspec" % (oai2ns, oai2ns))
        meta['listsets'] = [setspec.text for setspec in listsets]

        # Records without a URI are dropped.
        if meta.get("uri") is not None:
            nodes.append(meta)

    print("total récupérés %s" % len(nodes))
    return nodes