# -*- encoding: utf-8 -*-
import sys
import urllib, httplib, re, chardet

from lxml import etree
from urlparse import urlparse, urljoin

from auf_savoirs_en_partage.savoirs.models import ListSet
from auf_savoirs_en_partage.savoirs.globals import *
from auf_savoirs_en_partage.savoirs.lib.utils \
    import safe_append, print_structure, meta_set


map = {'title': [TITLE,],
       'creator': [CREATOR,],
       'contributor': [CONTRIBUTOR,],
       'subject': [SUBJECT,],
       'description': [DESCRIPTION,],
       'publisher': [PUBLISHER,],
       'date': [DATE_CREATION,],
       'type': [TYPE,],
       'identifier': [IDENTIFIER, URI],
       'format': [FORMAT,],
       'relation': [SOURCE,],
       }
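
# Illustrative note: harvest() below looks each Dublin Core element name up in
# `map` and copies its text into every listed field through meta_set(); a
# <dc:identifier> value, for instance, ends up under both IDENTIFIER and URI
# (the field constants come from the star import of savoirs.globals).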


# Open an HTTP or HTTPS connection to the host of a parsed URL, honouring an
# explicit port when one is given.
def connect(url):
    handle = None
    if url.scheme == 'https':
        port = 443
        if url.port is not None:
            port = url.port
        handle = httplib.HTTPSConnection (url.hostname, port)
    else:
        port = 80
        if url.port is not None:
            port = url.port
        handle = httplib.HTTPConnection (url.hostname, port)
    return handle

# Locate the OAI-PMH endpoint under a base URL: probe a few common paths with
# an Identify request and keep the first one that answers 200; if none does,
# the base URL is returned unchanged.
def find_location (url_str):
    url = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php", "oai/oai2.php")
    for test in possible:
        path = url.path + test
        handle = connect(url)
        handle.request ("GET", path + "?verb=Identify")
        r = handle.getresponse ()
        if r.status == 200:
            url = urlparse(urljoin(url.geturl(), test))
            break

    return url
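
# Illustrative example (hypothetical host): find_location("http://docs.example.org/")
# tries http://docs.example.org/perl/oai2?verb=Identify, then .../cgi/oai2, and
# so on, returning the parsed URL of the first candidate that answers 200.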

# Fetch an OAI response, normalise its encoding and markup quirks, and return
# the parsed root element.
def load_xml (url):
    ud = urllib.urlopen (url)
    original = ud.read()
    ud.close ()

    encoding = chardet.detect(original)['encoding']
    content = original.decode(encoding)

    # Greenstone workaround: some servers emit a broken OAI namespace URI
    content = content.replace ("\"http://www.openarchives.com/OAI/2.0\"",
                               "\"http://www.openarchives.org/OAI/2.0/\"")

    # lowercase element names (e.g. <OAI_DC:DC> becomes <oai_dc:dc>) so the
    # all-lowercase tag lookups used elsewhere in this module keep working
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I|re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    # Other crap: escape bare ampersands that are not already part of an entity
    content = re.sub(r"&(?!\w+;|#\w+;)", "&amp;", content)

    try:
        return etree.XML (content.encode("utf-8"))
    except:
        print "Parser error"
        print original
        sys.exit()

def store_listsets(options):
    """Query the server for all of its listsets and store them in the database."""

    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    url = find_location (options['url'])
    root = load_xml (url.geturl() + "?verb=ListSets")
    sets = root.findall (".//%sset" % oai2ns)

    listsets = [{'spec': e[0].text, 'name': e[1].text, 'server': options['server']} for e in sets]
    for data in listsets:
        ls, created = ListSet.objects.get_or_create(spec = data['spec'])
        del data['spec']
        for k, v in data.items():
            setattr(ls, k, v)
        ls.save()


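# Illustrative note on store_listsets() above (hypothetical data): a set element
# such as <set><setSpec>theses</setSpec><setName>Theses</setName></set> yields
# {'spec': 'theses', 'name': 'Theses', 'server': options['server']}; the
# e[0]/e[1] indexing assumes setSpec comes before setName, and a matching
# ListSet row is then created or updated.
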
def harvest (options):
    """Generic harvesting method for a system able to export its data in the
    `OAI <http://www.openarchives.org/>`_ format.

    *options*
        *options* is a dictionary and must contain at least the following
        keys:

        *url*
            Base URL of the remote repository; the actual OAI endpoint is
            located by ``find_location``.
        *server*
            Server the harvested listsets are attached to (stored on the
            ``ListSet`` records).

    The method returns a list of metadata dictionaries, one per harvested
    record.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    # fetch the server's listsets and store them in the database
    store_listsets(options)

    url = find_location (options['url'])

    records = []
    root = load_xml (url.geturl() + "?verb=ListRecords&metadataPrefix=oai_dc")
    records.extend (root.findall (".//%srecord" % oai2ns))
    token = root.find (".//%sresumptiontoken" % oai2ns)
    if token is not None:
        print "server total: %s" % token.get("completeListSize")

    # follow resumption tokens until the final, empty one
    while token is not None and token.text:
        root = load_xml (url.geturl() + "?verb=ListRecords&resumptionToken=%s" % token.text)
        records.extend (root.findall (".//%srecord" % oai2ns))
        token = root.find (".//%sresumptiontoken" % oai2ns)


    nodes = []
    for record in records:
        meta = {}
        node = record.find (".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
        meta[DATE_MODIFIED] = node.text

        dcnode = record.find (".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode.getchildren ():
                if c.text:
                    c.text = c.text.strip ()
                else:
                    c.text = ""

                if len (c.text) > 0:
                    match = map.get (c.tag.replace (metans, ""), [])
                    if c.tag.replace (metans, "") == "identifier" \
                            and not c.text.startswith("http"):
                        # non-http identifiers are ignored here; the fallback
                        # below fills the identifier in from the URI instead
                        pass
                    else:
                        for field in match:
                            meta_set (meta, field, c.text)

        #print meta, etree.tostring(record, pretty_print = True)
        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        # an identifier must be present; when the harvested one does not start
        # with http, fall back to the URI (e.g. aide-en-ligne provides an
        # identifier that is really a title)
        if meta.get("identifier") is None:
            meta['identifier'] = meta.get('uri')

        # collect the listsets this record belongs to
        listsets = record.findall (".//%sheader/%ssetspec" % (oai2ns, oai2ns))
        meta['listsets'] = [l.text for l in listsets]

        if meta.get("uri") is not None:
            nodes.append (meta)
    print "total records retrieved: %s" % len(nodes)
    return nodes
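

# Usage sketch (illustrative only, never called from this module): the endpoint
# below is hypothetical, the exact value expected for 'server' depends on the
# ListSet model, and a configured Django environment is required because of the
# model import at the top of this file.
def _example_harvest():
    options = {
        'url': 'http://repository.example.org/',  # hypothetical base URL; find_location() probes for the OAI endpoint
        'server': 'example-server',               # value copied onto ListSet.server for the harvested listsets
    }
    # each returned item is a dict of metadata fields plus a 'listsets' list
    return harvest(options)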