1 # -*- encoding: utf-8 -*-
4 from urlparse
import urlparse
, urljoin
5 import sys
, urllib
, httplib
, re
, chardet
7 from auf_savoirs_en_partage
.savoirs
.models
import ListSet
8 from auf_savoirs_en_partage
.savoirs
.globals import *
9 from auf_savoirs_en_partage
.savoirs
.lib
.utils \
10 import safe_append
, print_structure
, meta_set
# Maps a Dublin Core element name (namespace prefix stripped) to the list of
# metadata fields its value is stored under; harvest() looks tags up here.
# NOTE(review): the name shadows the builtin ``map``; it is kept because
# harvest() refers to this table by that name.
# NOTE(review): only the entries visible in the reviewed source are listed;
# confirm against the original module that none are missing.
map = {
    'title': [TITLE],
    'creator': [CREATOR],
    'contributor': [CONTRIBUTOR],
    'subject': [SUBJECT],
    'description': [DESCRIPTION],
    'publisher': [PUBLISHER],
    'date': [DATE_CREATION],
    'identifier': [IDENTIFIER, URI],
    'relation': [SOURCE],
}
def connect(url):
    """Return an (unconnected) httplib connection object for a parsed URL.

    *url* must expose ``scheme``, ``hostname`` and ``port`` attributes, as the
    result of ``urlparse()`` does.  An ``https`` scheme yields an
    ``HTTPSConnection``; any other scheme yields an ``HTTPConnection``.  When
    the URL carries no explicit port, the default port of the scheme is used.

    NOTE(review): the ``def`` line and the default-port assignments were not
    visible in the reviewed source; the name ``connect`` and the use of the
    standard default ports are reconstructed -- confirm against the original.
    """
    if url.scheme == 'https':
        factory, default_port = httplib.HTTPSConnection, httplib.HTTPS_PORT
    else:
        factory, default_port = httplib.HTTPConnection, httplib.HTTP_PORT

    port = url.port if url.port is not None else default_port
    return factory(url.hostname, port)
def find_location(url_str):
    """Locate the OAI endpoint of a server and return it as a parsed URL.

    Each well-known OAI script path is resolved against *url_str* and probed
    with a ``verb=Identify`` request.  The first candidate that answers with
    HTTP status 200 is returned (as a ``urlparse`` result).  If no candidate
    answers, the parsed *url_str* itself is returned unchanged.

    NOTE(review): the loop header, the status test and the ``return`` were not
    visible in the reviewed source and are reconstructed -- confirm.
    """
    url = urlparse(url_str)

    possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver",
                "oai/oai.php", "oai/oai2.php")
    for test in possible:
        # Resolve the candidate once and probe exactly the path that will be
        # returned.  Plain ``url.path + test`` disagrees with ``urljoin`` when
        # the base path is empty or does not end with a slash.
        candidate = urlparse(urljoin(url.geturl(), test))

        if candidate.scheme == 'https':
            factory, default_port = httplib.HTTPSConnection, httplib.HTTPS_PORT
        else:
            factory, default_port = httplib.HTTPConnection, httplib.HTTP_PORT
        port = candidate.port if candidate.port is not None else default_port

        handle = factory(candidate.hostname, port)
        try:
            handle.request("GET", candidate.path + "?verb=Identify")
            r = handle.getresponse()
            status = r.status
        finally:
            # Do not leak one socket per probed path.
            handle.close()

        if status == 200:
            url = candidate
            break

    return url
def load_xml(url):
    """Download *url* and return its content as a parsed XML element.

    The raw bytes are decoded with the encoding guessed by ``chardet``, a
    known wrong OAI namespace is corrected, tag names are lower-cased, and the
    result is re-encoded as UTF-8 and handed to ``etree.XML``.

    NOTE(review): the ``def`` line and the statement reading the response
    were not visible in the reviewed source; the name is taken from the call
    sites (``load_xml(url.geturl() + ...)``) -- confirm against the original.
    """
    ud = urllib.urlopen(url)
    try:
        original = ud.read()
    finally:
        ud.close()

    # chardet reports ``None`` when it cannot guess; fall back to UTF-8
    # instead of failing inside ``decode``.
    encoding = chardet.detect(original)['encoding'] or 'utf-8'
    content = original.decode(encoding)

    # Some servers advertise a wrong OAI namespace; rewrite it to the
    # official one so the namespaced lookups of the callers match.
    content = content.replace("\"http://www.openarchives.com/OAI/2.0\"",
                              "\"http://www.openarchives.org/OAI/2.0/\"")

    # Lower-case every tag name (opening and closing) so that callers can
    # search with lower-case tag names whatever case the server emitted.
    pattern = re.compile(r"<([/:\w]+)[>\s]", re.I | re.U)
    content = pattern.sub(lambda m: m.group(0).lower(), content)

    # NOTE(review): the original had a bare ``content.replace("&", "&")``
    # here whose result was discarded (a no-op); it has been dropped.  If an
    # ampersand clean-up was intended, it must not re-escape valid entities.

    return etree.XML(content.encode("utf-8"))
def store_listsets(options):
    """Query the server for all its list sets and store them in the database.

    Reads ``options['url']`` (location of the server) and
    ``options['server']`` (stored with every list set).  One ``ListSet`` row
    is fetched or created per set spec returned by ``verb=ListSets``.

    NOTE(review): the body of the per-set loop was not visible in the
    reviewed source; copying the attributes onto the row and saving it is a
    reconstruction -- confirm against the original.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    url = find_location(options['url'])
    root = load_xml(url.geturl() + "?verb=ListSets")
    sets = root.findall(".//%sset" % oai2ns)

    for e in sets:
        # Look the children up by (lower-cased) tag name rather than by
        # position, so a different child order or a missing child cannot
        # raise or swap the values.
        spec = e.findtext("%ssetspec" % oai2ns)
        if spec is None:
            # A set without a spec cannot be identified; skip it.
            continue
        data = {
            'spec': spec,
            'name': e.findtext("%ssetname" % oai2ns),
            'server': options['server'],
        }

        ls, created = ListSet.objects.get_or_create(spec=data['spec'])
        for k, v in data.items():
            setattr(ls, k, v)
        ls.save()
def harvest(options):
    """Generic harvester for a system able to export its data in the
    `OAI <http://www.openarchives.org/>`_ format.

    *options* is a dictionary.  This function reads ``options['url']`` and
    hands the whole dictionary to ``store_listsets()``, which additionally
    reads ``options['server']``.

    Returns a list of metadata dictionaries, one per harvested record that
    ends up with a ``uri`` entry.

    NOTE(review): several statements (list initialisations, the handling of
    empty element text, the final ``append``/``return``) were not visible in
    the reviewed source and are reconstructed -- confirm against the original.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
    metans = "{http://purl.org/dc/elements/1.1/}"

    # Fetch and store the list sets of the server.
    store_listsets(options)

    url = find_location(options['url'])
    base = url.geturl()

    records = []
    root = load_xml(base + "?verb=ListRecords&metadataPrefix=oai_dc")
    records.extend(root.findall(".//%srecord" % oai2ns))
    token = root.find(".//%sresumptiontoken" % oai2ns)
    if token is not None:
        print("total du serveur %s " % token.get("completeListSize"))

    # An OAI server marks the last page with an *empty* resumptionToken
    # element, so stop on empty text as well as on a missing element;
    # otherwise a bogus "resumptionToken=None" request would be sent.
    while token is not None and token.text and token.text.strip():
        # The token is opaque and must be URL-encoded by the harvester.
        quoted = urllib.quote(token.text.strip().encode("utf-8"))
        root = load_xml(base + "?verb=ListRecords&resumptionToken=%s" % quoted)
        records.extend(root.findall(".//%srecord" % oai2ns))
        token = root.find(".//%sresumptiontoken" % oai2ns)

    nodes = []
    for record in records:
        meta = {}

        node = record.find(".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
        if node is not None:
            meta[DATE_MODIFIED] = node.text

        dcnode = record.find(".//%s" % oaidc)
        if dcnode is not None:
            for c in dcnode:
                name = c.tag.replace(metans, "")
                # Element text may be missing altogether.
                text = (c.text or "").strip()

                # An identifier that is not a URL is ignored here; the
                # fallback below supplies one from the uri instead.
                if name == "identifier" and not text.startswith("http"):
                    continue

                for field in map.get(name, []):
                    meta_set(meta, field, text)

        if meta.get("uri") is None and meta.get("source") is not None:
            meta['uri'] = meta['source']

        # An identifier must be present; if the record's own identifier did
        # not start with http, use the uri (e.g. aide-en-ligne provides an
        # identifier that is really a title).
        if meta.get("identifier") is None:
            meta['identifier'] = meta.get('uri')

        # Collect the list sets the record belongs to.
        listsets = record.findall(".//%sheader/%ssetspec" % (oai2ns, oai2ns))
        meta['listsets'] = [spec.text for spec in listsets]

        # Records without any usable uri are dropped.
        if meta.get("uri") is not None:
            nodes.append(meta)

    print("total récupérés %s" % len(nodes))
    return nodes