import re
from django.contrib import admin
from django.utils.safestring import mark_safe
-from models import SourceActualite, Actualite, Discipline, Evenement, Record, HarvestLog
+from models import SourceActualite, Actualite, Discipline, Evenement, Record, ListSet, HarvestLog
from savoirs.globals import META
from savoirs.lib.backend import Backend
else:
output = unicode(self.original_value)
+ # pour les relations
+ try:
+ output = ", ".join([ls.name for ls in self.original_value.get_query_set()])
+ except:
+ pass
+
is_url = re.match('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', output)
if is_url:
output = "<a target='_blank' href='%s'>%s</a>" % (output, output)
+
return mark_safe(output)
def value_from_datadict(self, data, files, name):
'type',
'format',
'language',
+ 'listsets',
'disciplines',
'thematiques',
]
et les champs en lecture seule uniquement."""
self.search_fields = META.keys()
self.readonly_fields = META.keys()
+ self.readonly_fields.append('listsets')
super(RecordAdmin, self).__init__(*args, **kwargs)
def _uri(self, obj):
admin.site.register(Record, RecordAdmin)
class ListSetAdmin(ReadOnlyAdminFields, admin.ModelAdmin):
    """Admin configuration for ``ListSet`` entries.

    ``spec``, ``name`` and ``server`` are listed as read-only; ``hidden`` is
    the only displayed field that is not in ``readonly_fields``.  The change
    list can be filtered by ``server``.
    """
    fields = ['spec', 'name', 'server', 'hidden']
    # Own copy, so the form layout and the change-list columns never share
    # (and accidentally co-mutate) one list object.
    list_display = list(fields)
    readonly_fields = ['spec', 'name', 'server']
    list_filter = ('server',)
+
+admin.site.register(ListSet, ListSetAdmin)
+
class HarvestLogAdmin(ReadOnlyAdminFields, admin.ModelAdmin):
fields = ['context', 'name', 'added', 'updated', 'record']
list_display = fields + ['date']
for name in resources.keys ():
print "Import:", name
options = RESOURCES[name]
+ options['server'] = name
+
module = 'harvesters.%s.%s' \
% (options['type'], options['acces'])
__import__ (module)
# -*- encoding: utf-8 -*-
+import sys
from lxml import etree
from urlparse import urlparse, urljoin
import sys, urllib, httplib, re, chardet
+from auf_savoirs_en_partage.savoirs.models import ListSet
from auf_savoirs_en_partage.savoirs.globals import *
from auf_savoirs_en_partage.savoirs.lib.utils \
import safe_append, print_structure, meta_set
def find_location (url_str):
url = urlparse(url_str)
- possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php")
+ possible = ("perl/oai2", "cgi/oai2", "cgi-bin/oaiserver", "oai/oai.php", "oai/oai2.php")
for test in possible:
path = url.path + test
handle = connect(url)
# Other crap
content.replace("&", "&")
- return etree.XML (content.encode("utf-8"))
+ try:
+ return etree.XML (content.encode("utf-8"))
+ except:
+ print "Erreur parser"
+ print original
+ sys.exit()
+
def store_listsets(options):
    """Query the OAI server for all of its sets and store them in the database.

    ``options`` must provide ``'url'`` (handed to ``find_location`` to locate
    the OAI endpoint) and ``'server'`` (recorded on every stored ``ListSet``).

    One ``ListSet`` row is kept per set spec: it is created when missing,
    otherwise its ``name`` and ``server`` are refreshed from the response.
    """
    oai2ns = "{http://www.openarchives.org/OAI/2.0/}"
    url = find_location(options['url'])
    root = load_xml(url.geturl() + "?verb=ListSets")

    for node in root.findall(".//%sset" % oai2ns):
        # The spec and the name are read by position (first and second
        # child).  An entry with fewer than two children cannot provide
        # both, so it is skipped instead of raising IndexError.
        if len(node) < 2:
            continue

        values = {'name': node[1].text, 'server': options['server']}
        # ``defaults`` lets a new row be inserted fully populated in a single
        # write, rather than inserted blank and then updated.
        ls, created = ListSet.objects.get_or_create(spec=node[0].text,
                                                    defaults=values)
        if not created:
            for k, v in values.items():
                setattr(ls, k, v)
            ls.save()
def harvest (options):
oaidc = "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc"
metans = "{http://purl.org/dc/elements/1.1/}"
+ # récupère les listsets du serveur
+ store_listsets(options)
+
url = find_location (options['url'])
records = []
root = load_xml (url.geturl() + "?verb=ListRecords&metadataPrefix=oai_dc")
records.extend (root.findall (".//%srecord" % oai2ns))
- token = root.find (".//%sresumptionToken" % oai2ns)
+ token = root.find (".//%sresumptiontoken" % oai2ns)
+ print "total du serveur %s " % token.get("completeListSize")
while token is not None:
root = load_xml (url.geturl() + "?verb=ListRecords&resumptionToken=%s" % token.text)
records.extend (root.findall (".//%srecord" % oai2ns))
- token = root.find (".//%sresumptionToken" % oai2ns)
+ token = root.find (".//%sresumptiontoken" % oai2ns)
nodes = []
for record in records:
meta = {}
node = record.find (".//%sheader/%sdatestamp" % (oai2ns, oai2ns))
+
meta[DATE_MODIFIED] = node.text
dcnode = record.find (".//%s" % oaidc)
if meta.get("uri") is None and meta.get("source") is not None:
meta['uri'] = meta['source']
+ # récupère les listsets associés
+ listsets = record.findall (".//%sheader/%ssetspec" % (oai2ns, oai2ns))
+ meta['listsets'] = [l.text for l in listsets]
+
if meta.get("uri") is not None:
nodes.append (meta)
-
+ print "total récupérés %s" % len(nodes)
return nodes
# -*- encoding: utf-8 -*-
import simplejson, re, datetime, operator, hashlib
from savoirs.globals import *
-from savoirs.models import Record
+from savoirs.models import Record, ListSet
class SEPEncoder:
"""
meta[k] = self.encoder.decode(k, v)
return meta
+ # traitement spécial pour certaines clef de la structure
def listsets(self, record, value):
    """Link ``record`` to every stored ``ListSet`` whose spec is in ``value``.

    ``record`` is saved first, because a primary key is required before
    many-to-many relations can be created.  ``value`` is the collection of
    set specs harvested for the record; specs with no matching ``ListSet``
    row are ignored.  Existing links are left in place (this only adds).
    """
    # The record must have an id before relations can be attached to it.
    record.save()

    specs = list(value)
    if not specs:
        return

    # Let the database select the matching sets instead of loading every
    # ListSet row and filtering in Python for each record.
    record.listsets.add(*ListSet.objects.filter(spec__in=specs))
+
def _save (self, metadata):
r = Record ()
for k in metadata.keys ():
- setattr (r, k, self.encoder.encode(k, metadata[k]))
+ if hasattr(self, k):
+ method = getattr(self, k)
+ method(r, metadata[k])
+ else:
+ setattr (r, k, self.encoder.encode(k, metadata[k]))
+
r.last_checksum = hashlib.md5(str(metadata)).hexdigest()
r.last_update = datetime.datetime.today()
r.save()
return False
for k in metadata.keys ():
- setattr (r, k, self.encoder.encode(k, metadata[k]))
+ if hasattr(self, k):
+ method = getattr(self, k)
+ method(r, metadata[k])
+ else:
+ setattr (r, k, self.encoder.encode(k, metadata[k]))
r.last_update = datetime.datetime.today()
r.save()
matches.append ("MATCH(`%s`) AGAINST ('%s'%s)" % (k, " ".join(words), suffix))
m = "+".join (matches)
- q = "SELECT id, (" + m + ") AS score FROM savoirs_record WHERE (" \
- + m + ") HAVING score > 0 ORDER BY score DESC"
+ q = "SELECT id, (%s) AS score FROM savoirs_record \
+ WHERE (%s) HAVING score > 0 ORDER BY score DESC" % (m, m)
from django.db import connection, transaction
cursor = connection.cursor()
objects = ActiveManager()
class ListSet(models.Model):
    """A named set, identified by its spec, together with its server.

    ``hidden`` is a boolean flag that defaults to ``False``.
    """
    # NOTE(review): ``spec`` alone is the primary key, so two servers that
    # publish the same spec would share a single row -- confirm this is
    # intended.
    spec = models.CharField(primary_key=True, max_length=255)
    name = models.CharField(max_length=255)
    server = models.CharField(max_length=255)
    hidden = models.BooleanField(default=False)

    def __unicode__(self):
        # Human-readable label, in line with Record.__unicode__.
        return self.name
+
class Record(models.Model):
#fonctionnement interne
format = models.TextField(null = True, blank = True)
language = models.TextField(null = True, blank = True)
+ listsets = models.ManyToManyField(ListSet)
+
#SEP 2 (aucune données récoltées)
alt_title = models.TextField(null = True, blank = True)
abstract = models.TextField(null = True, blank = True)
disciplines = models.ManyToManyField(Discipline)
thematiques = models.ManyToManyField(Thematique)
+
def __unicode__(self):
return "R[%s] %s" % (self.id, self.title)