1 # -*- encoding: utf-8 -*-
4 from urlparse
import urlparse
, urljoin
5 import sys
, urllib
, httplib
, re
, chardet
7 from auf_savoirs_en_partage
.savoirs
.models
import ListSet
8 from auf_savoirs_en_partage
.savoirs
.globals import *
9 from auf_savoirs_en_partage
.savoirs
.lib
.utils \
10 import safe_append
, print_structure
, meta_set
13 map = {'title': [TITLE
,],
14 'creator': [CREATOR
,],
15 'contributor': [CONTRIBUTOR
,],
16 'subject': [SUBJECT
,],
17 'description': [DESCRIPTION
,],
18 'publisher': [PUBLISHER
,],
19 'date': [DATE_CREATION
,],
21 'identifier': [IDENTIFIER
, URI
],
23 'relation': [SOURCE
,],
29 if url
.scheme
== 'https':
31 if url
.port
is not None:
33 handle
= httplib
.HTTPSConnection (url
.hostname
, port
)
36 if url
.port
is not None:
38 handle
= httplib
.HTTPConnection (url
.hostname
, port
)
41 def find_location (url_str
):
42 url
= urlparse(url_str
)
44 possible
= ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php",
45 "oai/oai2.php", "oai/")
47 path
= url
.path
+ test
49 handle
.request ("GET", path
+ "?verb=Identify")
50 r
= handle
.getresponse ()
52 url
= urlparse(urljoin(url
.geturl(), test
))
58 ud
= urllib
.urlopen (url
)
62 encoding
= chardet
.detect(original
)['encoding']
63 content
= original
.decode(encoding
)
66 content
= content
.replace ("\"http://www.openarchives.com/OAI/2.0\"",
67 "\"http://www.openarchives.org/OAI/2.0/\"")
69 pattern
= re
.compile(r
"<([/:\w]+)[>\s]", re
.I|re
.U
)
70 content
= pattern
.sub(lambda m
: m
.group(0).lower(), content
)
73 content
.replace("&", "&")
75 return etree
.XML(content
.encode("utf-8"))
77 def store_listsets(options
):
78 """interroge le serveur pour récupérer tous les listsets et les stocke en bd."""
80 oai2ns
= "{http://www.openarchives.org/OAI/2.0/}"
81 url
= find_location (options
['url'])
82 root
= load_xml (url
.geturl() + "?verb=ListSets")
83 sets
= root
.findall (".//%sset" % oai2ns
)
85 listsets
= [{'spec':e
[0].text
, 'name':e
[1].text
, 'server':options
['server']} for e
in sets
]
87 ls
, created
= ListSet
.objects
.get_or_create(spec
= data
['spec'])
89 for k
,v
in data
.items():
94 def harvest (options
):
95 """Méthode de moissonage générique pour un système capable d'exporter ses
96 données au format `OAI <http://www.openarchives.org/>`_.
99 *options* est un dictionnaire, et doit contenir au moins les attributs
103 Nom du serveur distant.
105 Port du service http.
107 Racine de l'acces OAI.
109 La méthode retourne une liste d'éléments correspondant au format de
112 oai2ns
= "{http://www.openarchives.org/OAI/2.0/}"
113 oaidc
= "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
114 metans
= "{http://purl.org/dc/elements/1.1/}"
116 # récupère les listsets du serveur
117 store_listsets(options
)
119 url
= find_location (options
['url'])
122 root
= load_xml (url
.geturl() + "?verb=ListRecords&metadataPrefix=oai_dc")
123 records
.extend (root
.findall (".//%srecord" % oai2ns
))
124 token
= root
.find (".//%sresumptiontoken" % oai2ns
)
125 if token
is not None:
126 print "total du serveur %s " % token
.get("completeListSize")
128 while token
is not None:
129 root
= load_xml (url
.geturl() + "?verb=ListRecords&resumptionToken=%s" % token
.text
)
130 records
.extend (root
.findall (".//%srecord" % oai2ns
))
131 token
= root
.find (".//%sresumptiontoken" % oai2ns
)
135 for record
in records
:
137 node
= record
.find (".//%sheader/%sdatestamp" % (oai2ns
, oai2ns
))
139 meta
[DATE_MODIFIED
] = node
.text
141 dcnode
= record
.find (".//%s" % oaidc
)
142 if dcnode
is not None:
143 for c
in dcnode
.getchildren ():
145 c
.text
= c
.text
.strip ()
150 match
= map.get (c
.tag
.replace (metans
, ""), [])
151 if c
.tag
.replace(metans
, "") == "identifier" \
152 and not c
.text
.startswith("http"):
156 meta_set (meta
, field
, c
.text
)
158 #print meta, etree.tostring(record, pretty_print = True)
159 if meta
.get("uri") is None and meta
.get("source") is not None:
160 meta
['uri'] = meta
['source']
162 #un identifier doit être présent, s'il ne commence pas par http, prendre l'uri
163 #(i.e : aide-en-ligne fournit un identifier sous forme de titre)
164 if meta
.get("identifier") is None:
165 meta
['identifier'] = meta
.get('uri')
167 # récupère les listsets associés
168 listsets
= record
.findall (".//%sheader/%ssetspec" % (oai2ns
, oai2ns
))
169 meta
['listsets'] = [l
.text
for l
in listsets
]
171 if meta
.get("uri") is not None:
173 print "total récupérés %s" % len(nodes
)