Commit | Line | Data |
---|---|---|
0cc5f772 | 1 | # -*- encoding: utf-8 -*- |
d972b61d | 2 | import sys |
0cc5f772 CR |
3 | from lxml import etree |
4 | from urlparse import urlparse, urljoin | |
5 | import sys, urllib, httplib, re, chardet | |
6 | ||
d972b61d | 7 | from auf_savoirs_en_partage.savoirs.models import ListSet |
ecc62927 | 8 | from auf_savoirs_en_partage.savoirs.globals import * |
0cc5f772 CR |
9 | from auf_savoirs_en_partage.savoirs.lib.utils \ |
10 | import safe_append, print_structure, meta_set | |
11 | ||
12 | ||
# Dublin Core element name (namespace stripped) -> list of metadata fields the
# element's text is stored under.  An ``identifier`` feeds both IDENTIFIER and
# URI.
# NOTE: this name shadows the builtin ``map``; it is kept because ``harvest``
# looks the table up under this name.
map = dict(
    title=[TITLE],
    creator=[CREATOR],
    contributor=[CONTRIBUTOR],
    subject=[SUBJECT],
    description=[DESCRIPTION],
    publisher=[PUBLISHER],
    date=[DATE_CREATION],
    type=[TYPE],
    identifier=[IDENTIFIER, URI],
    format=[FORMAT],
    relation=[SOURCE],
)
25 | ||
26 | ||
def connect(url):
    """Return an unopened httplib connection matching a parsed URL.

    *url* is a parse result exposing ``scheme``, ``hostname`` and ``port``.
    An ``https`` scheme yields an ``HTTPSConnection`` (default port 443);
    every other scheme yields an ``HTTPConnection`` (default port 80).  An
    explicit port in the URL overrides the default.
    """
    secure = url.scheme == 'https'
    explicit_port = url.port
    if explicit_port is None:
        explicit_port = 443 if secure else 80
    if secure:
        return httplib.HTTPSConnection(url.hostname, explicit_port)
    return httplib.HTTPConnection(url.hostname, explicit_port)
40 | ||
def find_location(url_str):
    """Locate the OAI endpoint of a site by probing well-known paths.

    Each candidate suffix is resolved against *url_str* with ``urljoin`` and
    queried with ``?verb=Identify``.  The first candidate answering with HTTP
    status 200 is returned as a parsed URL.  If no candidate answers 200, the
    parsed *url_str* is returned unchanged.

    The URL that is probed is exactly the URL that is returned, so the probe
    and the result cannot disagree when the base path has no trailing slash.
    Errors raised while connecting are not caught here.
    """
    url = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php",
                "oai/oai2.php", "oai/", "")
    for test in possible:
        candidate = urlparse(urljoin(url.geturl(), test))
        handle = connect(url)
        try:
            # An empty path is not a valid request target; fall back to "/".
            handle.request("GET", (candidate.path or "/") + "?verb=Identify")
            status = handle.getresponse().status
        finally:
            # One connection is opened per probe; always release it.
            handle.close()
        if status == 200:
            return candidate

    return url
56 | ||
# An ampersand that does not start a character or entity reference.
_BARE_AMPERSAND = re.compile(
    r"&(?!(?:[A-Za-z_:][\w.:-]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)")


def load_xml(url):
    """Download *url* and return its content parsed as an lxml element.

    The raw bytes are decoded with the encoding guessed by ``chardet``
    (UTF-8 when no guess is available), then cleaned up before parsing:

    * the OAI namespace with a ``.com`` host is rewritten to the
      ``http://www.openarchives.org/OAI/2.0/`` one;
    * tag names are lower-cased, which is why callers must search for
      lower-case element names;
    * if parsing fails, ampersands that do not start an entity reference are
      escaped as ``&amp;`` and parsing is retried once.

    Raises ``etree.XMLSyntaxError`` when the document still cannot be parsed.
    """
    ud = urllib.urlopen(url)
    try:
        original = ud.read()
    finally:
        # Release the handle even if the read fails.
        ud.close()

    encoding = chardet.detect(original)['encoding']
    content = original.decode(encoding or 'utf-8')

    # Greenstone workaround: it advertises a wrong OAI namespace.
    content = content.replace("\"http://www.openarchives.com/OAI/2.0\"",
                              "\"http://www.openarchives.org/OAI/2.0/\"")

    # Lower-case every tag name so lookups do not depend on server casing.
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I | re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    # NOTE(review): the bytes given to lxml are always UTF-8, even when the
    # document's XML declaration names another encoding -- confirm this is
    # handled correctly for non-UTF-8 sources.
    try:
        return etree.XML(content.encode("utf-8"))
    except etree.XMLSyntaxError:
        # Retry only on failure, so documents that already parse are left
        # untouched (including any ampersands inside CDATA sections).
        repaired = _BARE_AMPERSAND.sub("&amp;", content)
        if repaired == content:
            raise
        return etree.XML(repaired.encode("utf-8"))
d972b61d | 76 | |
def store_listsets(options):
    """Query the server for all of its sets and store them in the database.

    Reads ``options['url']`` to locate the OAI endpoint and
    ``options['server']`` as the server value saved on each set.  For every
    ``set`` element of the ``ListSets`` response, the text of its first child
    is used as the spec and the text of its second child as the name.  A
    ``ListSet`` row is fetched or created by spec, then updated and saved.
    """
    ns = "{http://www.openarchives.org/OAI/2.0/}"
    endpoint = find_location(options['url'])
    tree = load_xml(endpoint.geturl() + "?verb=ListSets")

    # Collect everything first so a malformed set fails before any write.
    rows = [(node[0].text, node[1].text, options['server'])
            for node in tree.findall(".//%sset" % ns)]

    for spec, name, server in rows:
        listset, _created = ListSet.objects.get_or_create(spec=spec)
        listset.name = name
        listset.server = server
        listset.save()
0cc5f772 CR |
92 | |
93 | ||
def harvest(options):
    """Generic harvesting method for a system able to export its data in
    the `OAI <http://www.openarchives.org/>`_ format.

    *options*
        *options* is a dictionary.  The keys read here, directly or through
        ``store_listsets``, are:

        *url*
            URL of the remote site; the OAI endpoint is located from it
            with ``find_location``.
        *server*
            Name of the remote server, saved on each stored set.

    The server's sets are stored first, then every page of ``ListRecords``
    (``oai_dc`` metadata) is fetched by following resumption tokens.

    Returns a list of metadata dictionaries, one per record that has a
    ``uri``.  Each dictionary also carries the record's datestamp and a
    ``listsets`` entry holding the texts of the record's ``setspec`` elements.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    # Fetch and store the server's sets.
    store_listsets(options)

    base = find_location(options['url']).geturl()

    records = []
    root = load_xml(base + "?verb=ListRecords&metadataPrefix=oai_dc")
    records.extend(root.findall(".//%srecord" % oai2ns))
    # Element names are lower-case because load_xml lower-cases tag names.
    token = root.find(".//%sresumptiontoken" % oai2ns)
    if token is not None:
        print("total du serveur %s " % token.get("completeListSize"))

    # The last page of a list carries an *empty* resumptionToken element:
    # stop there instead of requesting "resumptionToken=None".
    while token is not None and token.text and token.text.strip():
        # Tokens are opaque and may contain reserved characters, so they
        # must be URL-encoded before being placed in the query string.
        resume = urllib.quote(token.text.strip().encode("utf-8"), safe="")
        root = load_xml(base + "?verb=ListRecords&resumptionToken=%s" % resume)
        records.extend(root.findall(".//%srecord" % oai2ns))
        token = root.find(".//%sresumptiontoken" % oai2ns)

    nodes = []
    for record in records:
        meta = {}
        node = record.find(".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
        meta[DATE_MODIFIED] = node.text

        dcnode = record.find(".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode:
                c.text = c.text.strip() if c.text else ""
                if not c.text:
                    continue

                name = c.tag.replace(metans, "")
                # Identifiers that are not URLs are ignored.
                if name == "identifier" and not c.text.startswith("http"):
                    continue

                for field in map.get(name, []):
                    meta_set(meta, field, c.text)

        # Without a uri, fall back to the source (dc:relation) value.
        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        # An identifier must be present; identifiers that do not start with
        # http were skipped above (e.g. aide-en-ligne provides a title as
        # identifier), so fall back to the uri.
        if meta.get("identifier") is None:
            meta['identifier'] = meta.get('uri')

        # Collect the sets this record belongs to.
        listsets = record.findall(".//%sheader/%ssetspec" % (oai2ns, oai2ns))
        meta['listsets'] = [l.text for l in listsets]

        # Records that still have no uri are dropped.
        if meta.get("uri") is not None:
            nodes.append(meta)

    print("total récupérés %s" % len(nodes))
    return nodes