auf_savoirs_en_partage/savoirs/lib/harvesters/oai/generic.py
# -*- encoding: utf-8 -*-
from lxml import etree
from urlparse import urlparse, urljoin
import sys, urllib, httplib, re, chardet

from auf_savoirs_en_partage.savoirs.models import ListSet
from auf_savoirs_en_partage.savoirs.globals import *
from auf_savoirs_en_partage.savoirs.lib.utils \
    import safe_append, print_structure, meta_set


# Maps Dublin Core element names to the metadata fields they populate.
map = {'title': [TITLE,],
       'creator': [CREATOR,],
       'contributor': [CONTRIBUTOR,],
       'subject': [SUBJECT,],
       'description': [DESCRIPTION,],
       'publisher': [PUBLISHER,],
       'date': [DATE_CREATION,],
       'type': [TYPE,],
       'identifier': [IDENTIFIER, URI],
       'format': [FORMAT,],
       'relation': [SOURCE,],
       }


def connect(url):
    """Opens an HTTP or HTTPS connection to the host of the parsed URL."""
    handle = None
    if url.scheme == 'https':
        port = 443
        if url.port is not None:
            port = url.port
        handle = httplib.HTTPSConnection (url.hostname, port)
    else:
        port = 80
        if url.port is not None:
            port = url.port
        handle = httplib.HTTPConnection (url.hostname, port)
    return handle

def find_location (url_str):
    """Probes a few common OAI endpoint paths and returns the first one that
    answers an Identify request; falls back to the original URL."""
    url = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php",
                "oai/oai2.php", "oai/")
    for test in possible:
        path = url.path + test
        handle = connect(url)
        handle.request ("GET", path + "?verb=Identify")
        r = handle.getresponse ()
        if r.status == 200:
            url = urlparse(urljoin(url.geturl(), test))
            break

    return url

def load_xml (url):
    ud = urllib.urlopen (url)
    original = ud.read()
    ud.close ()

    encoding = chardet.detect(original)['encoding']
    content = original.decode(encoding)

    # Greenstone crap: some servers advertise the wrong OAI namespace
    content = content.replace ("\"http://www.openarchives.com/OAI/2.0\"",
                               "\"http://www.openarchives.org/OAI/2.0/\"")

    # Lowercase tag names so lookups work regardless of the server's casing
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I|re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    # Other crap: escape bare ampersands so the document parses, leaving
    # existing entities ("&amp;", "&#38;", ...) untouched
    content = re.sub(r"&(?!#?\w+;)", "&amp;", content)

    return etree.XML(content.encode("utf-8"))

def store_listsets(options):
    """Queries the server for all of its listsets and stores them in the
    database."""

    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    url = find_location (options['url'])
    root = load_xml (url.geturl() + "?verb=ListSets")
    sets = root.findall (".//%sset" % oai2ns)

    listsets = [{'spec': e[0].text, 'name': e[1].text, 'server': options['server']}
                for e in sets]
    for data in listsets:
        ls, created = ListSet.objects.get_or_create(spec = data['spec'])
        del data['spec']
        for k, v in data.items():
            setattr(ls, k, v)
        ls.save()


def harvest (options):
    """Generic harvesting method for any system able to export its data in
    the `OAI <http://www.openarchives.org/>`_ format.

    *options*
        *options* is a dictionary and must contain at least the following
        entries:

        *url*
            Root URL of the OAI access point (the port, if any, is taken
            from the URL itself).
        *server*
            Name of the remote server.

    The method returns a list of elements matching the metadata format.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    # fetch the server's listsets
    store_listsets(options)

    url = find_location (options['url'])

    records = []
    root = load_xml (url.geturl() + "?verb=ListRecords&metadataPrefix=oai_dc")
    records.extend (root.findall (".//%srecord" % oai2ns))
    token = root.find (".//%sresumptiontoken" % oai2ns)
    if token is not None:
        print "server total: %s" % token.get("completeListSize")

    # follow resumption tokens until the server stops sending a non-empty one
    while token is not None and token.text:
        root = load_xml (url.geturl() + "?verb=ListRecords&resumptionToken=%s" % token.text)
        records.extend (root.findall (".//%srecord" % oai2ns))
        token = root.find (".//%sresumptiontoken" % oai2ns)

    nodes = []
    for record in records:
        meta = {}
        node = record.find (".//%sheader/%sdatestamp" % (oai2ns, oai2ns))

        meta[DATE_MODIFIED] = node.text

        dcnode = record.find (".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode.getchildren ():
                if c.text:
                    c.text = c.text.strip ()
                else:
                    c.text = ""

                if len (c.text) > 0:
                    match = map.get (c.tag.replace (metans, ""), [])
                    if c.tag.replace(metans, "") == "identifier" \
                            and not c.text.startswith("http"):
                        # skip identifiers that are not URLs; they are handled
                        # below through the uri
                        pass
                    else:
                        for field in match:
                            meta_set (meta, field, c.text)

        #print meta, etree.tostring(record, pretty_print = True)
        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        # an identifier must be present; if it did not start with http, fall
        # back to the uri (e.g. aide-en-ligne provides an identifier that is
        # only a title)
        if meta.get("identifier") is None:
            meta['identifier'] = meta.get('uri')

        # fetch the associated listsets
        listsets = record.findall (".//%sheader/%ssetspec" % (oai2ns, oai2ns))
        meta['listsets'] = [l.text for l in listsets]

        if meta.get("uri") is not None:
            nodes.append (meta)
    print "total harvested: %s" % len(nodes)
    return nodes
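

# Usage sketch (not part of the original module; the URL and server name are
# hypothetical): a caller would typically pass the OAI root and a server
# identifier, then store or index the returned metadata dictionaries, whose
# keys are the field constants from savoirs.globals (TITLE, URI, ...).
#
#   options = {'url': 'http://oai.example.org/', 'server': 'example-server'}
#   for record in harvest(options):
#       print record.get(URI), record.get(TITLE)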