#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Dynamic export tool for w.c.s. data.

Copyright: Agence universitaire de la Francophonie — www.auf.org
Licence: GNU General Public Licence, version 2
Author: Jean Christophe André
Created: 13 March 2013

Depends: wcs, python-simplejson, python-magic

Exposed URLs:
- /dynexport => list of forms for the current domain
- /dynexport/domains.json => list of available domains
- /dynexport/formulaire => list of the options below
- /dynexport/formulaire/fields.json
- /dynexport/formulaire/field-names.json
- /dynexport/formulaire/field-names.txt
- /dynexport/formulaire/data.json
- /dynexport/formulaire/last-run.log
- /dynexport/formulaire/liste-dossiers.json
- /dynexport/formulaire/clear-cache => empties the cache
- /dynexport/formulaire/data/nom-dossier.json
- /dynexport/formulaire/data/nom-dossier_attachement-1.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-2.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-…
"""
import sys
import os
import os.path
import logging
import time  # time, gmtime, strftime, strptime, struct_time
import simplejson as json
import magic
import mimetypes
import unicodedata
from cStringIO import StringIO
from gzip import GzipFile
from re import match

EXPIRE_DELAY = 5  # keep responses at most 5 seconds in the web cache
TIME_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'  # HTTP date format
#ETABLISSEMENT_FORMAT = r'^(\w+\s-\s.+)\s\(\d+\s-\s(Nord|Sud)\)$'

WCS_ROOT_DIR = '/var/lib/wcs'
WCS_DOMAIN_SUFFIX = '.auf.org'
WCS_CACHE_DIR = '/var/tmp'
WCS_CACHE_DELAY_DEFAULT = 7*24*60*60  # 1 week
WCS_CACHE_DELAY_FORMS = 5*60  # 5 minutes

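# Usage sketch (illustrative only, not part of the original tool): the
# commented-out block below shows how a client could walk the URLs listed in
# the module docstring. The host name and form index are placeholder
# assumptions.
#
#import urllib2
#BASE = 'https://formulaires.example.auf.org/dynexport'  # hypothetical mount point
## list the forms of the current domain, then dump every dossier of the first one
#forms = json.loads(urllib2.urlopen(BASE + '/index.json').read())
#dossiers = json.loads(urllib2.urlopen('%s/%s/liste-dossiers.json'
#                                      % (BASE, forms[0])).read())
#for name in dossiers:
#    dossier = json.loads(urllib2.urlopen('%s/%s/data/%s'
#                                         % (BASE, forms[0], name)).read())
#    print dossier['num_dossier'], dossier['wcs_workflow_status']
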
#--------------------------------------------------------------------------
# global variables
#--------------------------------------------------------------------------

pub = None

#--------------------------------------------------------------------------
# processing functions
#--------------------------------------------------------------------------

def http_redirect(location, code='302'):
    headers = {}
    headers['Content-Type'] = 'text/plain; charset=utf-8'
    headers['Status'] = '302 Redirection'
    headers['Location'] = location
    data = ("If you see this, it means the automatic redirection has failed.\n"
            "Please go to %s" % location)
    # send the response
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    if data:
        f.write(data)
    f.flush()
    # stop processing here
    sys.exit(0)

def http_reply_and_exit(data, mime_type='text/html', charset='utf-8'):
    # time references
    current_time = time.time()
    mtime = time.gmtime(current_time)
    etime = time.gmtime(current_time + EXPIRE_DELAY)
    if os.environ.has_key('HTTP_IF_MODIFIED_SINCE'):
        try:
            itime = time.strptime(os.environ['HTTP_IF_MODIFIED_SINCE'], TIME_FORMAT)
        except ValueError:
            itime = None
    else:
        itime = None
    # prepare headers and data
    headers = {}
    headers['Content-Type'] = '%s; charset=%s' % (mime_type, charset)
    headers['Last-Modified'] = time.strftime(TIME_FORMAT, mtime)
    headers['Expires'] = time.strftime(TIME_FORMAT, etime)
    if os.environ['REQUEST_METHOD'] == 'GET' and (not itime or mtime > itime):
        # pick the requested representation (compressed or not)
        accepted = os.environ.get('HTTP_ACCEPT_ENCODING', '').split(',')
        if 'gzip' in [x.strip() for x in accepted]:
            zdata = StringIO()
            zfile = GzipFile('', 'w', 9, zdata)
            zfile.write(data)
            zfile.close()  # flush the gzip trailer before reading the buffer
            data = zdata.getvalue()
            headers['Content-Encoding'] = 'gzip'
            headers['Vary'] = 'Accept-Encoding'
        headers['Content-Length'] = len(data)
    else:
        data = None
    # send the response
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    if data:
        f.write(data)
    f.flush()
    # stop processing here
    sys.exit(0)

def _reduce_to_alnum(s, replacement_char='-'):
    """reduce a character string to plain alphanumeric characters"""
    if type(s) is not unicode:
        s = unicode(s, 'utf-8')
    s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
    r = ''
    for c in s:
        if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            r += c
        elif len(r) > 0 and r[-1] != replacement_char:
            r += replacement_char
        else:  # r == '' or r[-1] == replacement_char
            pass
    return r.strip(replacement_char)

def _make_wcs_cache_name(domain, form, name):
    return 'wcs-%s-%s-%s' % (domain, form, name)

def set_wcs_cache(domain, form, name, data, delay=WCS_CACHE_DELAY_DEFAULT):
    os.umask(0022)
    cache_filename = _make_wcs_cache_name(domain, form, name)
    cache_filename = os.path.join(WCS_CACHE_DIR, cache_filename)
    f = open(cache_filename, 'wb')
    f.write(data)
    f.close()
    # the modification time is used as the expiration date
    atime = time.time()
    mtime = atime + delay
    os.utime(cache_filename, (atime, mtime))

def get_wcs_cache(domain, form, name):
    data = None
    cache_filename = _make_wcs_cache_name(domain, form, name)
    cache_filename = os.path.join(WCS_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filename):
        # the modification time is used as the expiration date
        if time.time() < os.path.getmtime(cache_filename):
            data = open(cache_filename, 'rb').read()
        else:
            os.unlink(cache_filename)
    return data

def clear_wcs_cache(domain, form):
    cache_filename = _make_wcs_cache_name(domain, form, '')
    for f in os.listdir(WCS_CACHE_DIR):
        if f.startswith(cache_filename):
            os.unlink(os.path.join(WCS_CACHE_DIR, f))

def set_wcs_publisher(domain):
    global pub
    if pub is None:
        from wcs import publisher
        pub = publisher.WcsPublisher.create_publisher()
        pub.app_dir = os.path.join(pub.app_dir, domain)
        pub.set_config()

def get_wcs_domains():
    root = WCS_ROOT_DIR
    suffix = WCS_DOMAIN_SUFFIX
    try:
        l = os.listdir(root)
    except OSError:
        return None
    return [x for x in l
            if os.path.isdir(os.path.join(root, x)) and x.endswith(suffix)]

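# Cache layout (descriptive sketch; the domain and form names below are made-up
# examples): the helpers above store every artefact flat in WCS_CACHE_DIR, e.g.
#   /var/tmp/wcs-formulaires.auf.org-demande-de-bourse-fields.json
# for domain 'formulaires.auf.org' and form 'demande-de-bourse'. The file's
# mtime is pushed `delay` seconds into the future and acts as the expiry date:
# get_wcs_cache() deletes and ignores any entry whose mtime is in the past.
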
def get_wcs_forms(domain):
    """extract the list of forms"""
    data = get_wcs_cache(domain, 'ALL', 'ALL.json')
    if data is not None:
        return json.loads(data, encoding='utf-8')
    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    forms = [f.url_name for i, f in FormDef.items()]
    data = json.dumps(forms, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, 'ALL', 'ALL.json', data, WCS_CACHE_DELAY_FORMS)
    return forms

def get_wcs_form_data(domain, form):
    """extract the form data"""
    data = get_wcs_cache(domain, form, 'metadata.json')
    if data is not None:
        return json.loads(data, encoding='utf-8')
    # metadata dictionary (it will be cached at the end)
    metadata = {}
    os.umask(0022)
    logname = _make_wcs_cache_name(domain, form, 'last-run.log')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename=os.path.join(WCS_CACHE_DIR, logname),
                        filemode='w')
    logging.info('Début.')
    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    from wcs.fields import TitleField, CommentField, TextField, \
                           StringField, ItemField, ItemsField, EmailField, \
                           DateField, FileField, BoolField, TableField
    formdef = FormDef.get_by_urlname(form)
    # give each field a unique name
    fields = {}
    field_names = {}
    field_names_duplicates = {}
    for i, field in enumerate(formdef.fields):
        if isinstance(field, TitleField) or isinstance(field, CommentField):
            continue
        if field.varname:
            name = field.varname
        else:
            name = _reduce_to_alnum(field.label, '_').lower()
        if name in field_names.values():  # duplicate
            field_names_duplicates[name] = field_names_duplicates.get(name, 1) + 1
            name = '%s_%d' % (name, field_names_duplicates[name])
        field_names.update({field.id: name})
        fields.update({field.id: {'index': i,
                                  'name': field_names[field.id],
                                  'label': field.label,
                                  'varname': field.varname and field.varname or ''}})
    data = json.dumps(fields, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'fields.json', data)
    metadata.update({'fields': fields})
    # load the MIME type database once and for all
    #magicmime = magic.Magic(mime=True) => for later…
    magicmime = magic.open(magic.MAGIC_MIME)
    magicmime.load()
    liste_dossiers = []
    liste_attachements = {}
    for object in formdef.data_class().select():
        if object.user is None:
            logging.warning("Dossier '%s' sans utilisateur associé ?!?"
                            " On ignore...", object.id)
            continue
        try:
            workflow_status = object.status.startswith('wf-') and \
                              object.get_workflow_status().name or None
        except:
            workflow_status = None
        result = {
            'num_dossier': object.id,
            'wcs_status': object.status,
            'wcs_workflow_status': workflow_status,
            'wcs_user_email': object.user.email,
            'wcs_user_display_name': object.user.display_name,
            #'wcs_last_modified': time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(object.last_modified())),
            'wcs_comments': [],
        }
        if object.evolution is not None:
            for e in object.evolution:
                if e.comment is not None:
                    try:
                        who = pub.user_class.get(e.who).display_name
                    except:
                        who = 'Inconnu(e)'
                    e_time = time.strftime('%Y-%m-%d %H:%M:%S', e.time)
                    comment = '%s -- %s %s' % (e.comment, who, e_time)
                    result['wcs_comments'].append(comment)
        qfiles = {}
        for field in formdef.fields:
            field_id = str(field.id)
            if not field_id in object.data:
                continue
            if isinstance(field, TitleField) or isinstance(field, CommentField):
                continue
            field_name = fields[field_id]['name']
            data = object.data.get(field_id)
            if data is None:
                result[field_name] = None
                continue
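            # Field-type dispatch (summary of the branches below): string-like
            # fields (StringField, TextField, EmailField, ItemField) are copied
            # as-is, ItemsField/TableField keep their list value, BoolField is
            # turned into a real boolean, DateField is rendered as YYYY-MM-DD,
            # and FileField is replaced by a '<field_name>.<extension>' file
            # name whose source path is remembered in qfiles for later export.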
            if isinstance(field, StringField) or isinstance(field, TextField) \
               or isinstance(field, EmailField) or isinstance(field, ItemField):
                # clean up the institution name (drop the id and Nord/Sud tag)
                #m = match(ETABLISSEMENT_FORMAT, data)
                #if m is not None:
                #    data = m.groups()[0]
                result[field_name] = data
            elif isinstance(field, ItemsField) or isinstance(field, TableField):
                result[field_name] = data  # list => could be joined with ';'
            elif isinstance(field, BoolField):
                result[field_name] = (data == 'True')
            elif isinstance(field, DateField):
                if isinstance(data, time.struct_time):
                    result[field_name] = '%04d-%02d-%02d' % (data.tm_year,
                                                             data.tm_mon,
                                                             data.tm_mday)
                else:
                    result[field_name] = data
            elif isinstance(field, FileField):
                if '.' in data.orig_filename:
                    extension = data.orig_filename.rpartition('.')[2].lower()
                else:
                    # the file name carries no extension: ask libmagic
                    p = os.path.join(pub.app_dir, 'uploads', data.qfilename)
                    try:
                        #m = magicmime.from_file(p) => for later…
                        m = magicmime.file(p).split()[0].strip(';')
                        extension = mimetypes.guess_extension(m)
                    except:
                        logging.warning("Type de fichier inconnu pour '%s'.", p)
                        extension = None
                    if extension is not None:
                        extension = extension[1:]  # drop the leading dot
                    else:
                        extension = 'unknown'
                result[field_name] = "%s.%s" % (field_name, extension)
                qfiles[field_name] = data.qfilename
            else:
                logging.warning("Type de champ inconnu '%s' pour '%s' (%s).",
                                field.__class__.__name__, field_name, field.label)
        num_dossier = result['num_dossier']
        nom = _reduce_to_alnum(result.get('nom', 'sans-nom')).upper()
        prenom = _reduce_to_alnum(result.get('prenom', 'sans-prenom')).upper()
        adel = result.get('adresse_electronique', 'sans-adel').replace('@', '-').lower()
        filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel)
        liste_dossiers.append(filename + '.json')
        # remember where the attached files live
        for f in qfiles:
            dst = filename + '_' + result[f]
            src = os.path.join(pub.app_dir, 'uploads', qfiles[f])
            liste_attachements.update({dst: src})
            # rename the attachment referenced in the dossier accordingly
            result[f] = dst
        # generate the JSON file
        data = json.dumps(result, ensure_ascii=False).encode('utf-8')
        set_wcs_cache(domain, form, 'data_%s.json' % filename, data)
        logging.info("Dossier '%s' : %s.", filename, result['wcs_workflow_status'])
    data = json.dumps(liste_attachements, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'data-files.json', data)
    metadata.update({'attachements': liste_attachements})
    liste_dossiers.sort()
    data = json.dumps(liste_dossiers, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'liste-dossiers.json', data)
    metadata.update({'dossiers': liste_dossiers})
    logging.info('Fin.')
    data = json.dumps(metadata, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'metadata.json', data)

#if __name__ == '__main__':
#    try:
#        extract_data(formdef, OUTPUT_DIRECTORY)
#    except:
#        logging.exception("Interruption du traitement pour cause d'erreur !")

#--------------------------------------------------------------------------
# web request handling
#--------------------------------------------------------------------------

#l = []
#for k in sorted(os.environ):
#    l.append('%s=%s\n' % (k, os.environ[k]))
#data = ''.join(l)
#http_reply_and_exit(data, 'text/plain')

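# Request routing (illustrative values, not taken from a real request): under
# CGI, a request for /dynexport/demande-de-bourse/fields.json typically arrives
# with REQUEST_URI='/dynexport/demande-de-bourse/fields.json' and
# PATH_INFO='/demande-de-bourse/fields.json'; path_prefix below then becomes
# '/dynexport' and every route is matched against PATH_INFO.
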
domain = os.environ.get('HTTP_HOST', '')
if domain not in get_wcs_domains():
    http_reply_and_exit("Domaine '%s' inconnu." % domain, 'text/plain')

path_info = os.environ.get('PATH_INFO', '')
path_prefix = os.environ.get('REQUEST_URI', '')
if len(path_info) > 0:
    path_prefix = path_prefix[:-len(path_info)]

if path_info == '':
    http_redirect(path_prefix + '/')

if path_info == '/':  # list of available forms
    l = sorted(get_wcs_forms(domain))
    l = ['<li><a href="%s/">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des formulaires disponibles :</p>\n'
    data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    http_reply_and_exit(data, 'text/html')

if path_info == '/index.json':  # list of available forms
    l = sorted(get_wcs_forms(domain))
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if path_info == '/domains.json':  # list of available domains
    l = get_wcs_domains()
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

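# Example responses (made-up values): /index.json could return
# ["demande-de-bourse", "inscription-colloque"], and /domains.json something
# like ["formulaires.auf.org", "enquetes.auf.org"]; the actual names depend on
# the w.c.s. instances present under WCS_ROOT_DIR.
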
if match(r'^/[a-z0-9-]+$', path_info):
    http_redirect(path_prefix + path_info + '/')

if match(r'^/[a-z0-9-]+/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = ['fields.json', 'field-names.json', 'field-names.txt',
         'last-run.log', 'liste-dossiers.json']
    l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des informations disponibles :</p>\n'
    action1 = '<p><a href="data/">Export des données</a></p>\n'
    action2 = '<p><a href="clear-cache">Suppression du cache (pour ré-export)</a></p>\n'
    data = ('<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n'
            + action1 + action2 + '</html>')
    http_reply_and_exit(data, 'text/html')

if match(r'^/[a-z0-9-]+/index.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = ['fields.json', 'field-names.json', 'field-names.txt',
         'last-run.log', 'liste-dossiers.json', 'data', 'clear-cache']
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/clear-cache$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    clear_wcs_cache(domain, form)
    http_reply_and_exit('Ok.', 'text/plain')

if match(r'^/[a-z0-9-]+/fields.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/field-names.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = dict([(k, d[k]['name']) for k in d])
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/field-names.txt$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = [(k, d[k]['name'], d[k]['label']) for k in d]
    d = sorted(d, key=lambda x: int(x[0]))
    text = u''.join([u'%s:%s:%s\n' % (x[0], x[1], x[2]) for x in d])
    data = text.encode('utf-8')
    http_reply_and_exit(data, 'text/plain')

if match(r'^/[a-z0-9-]+/last-run.log$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = get_wcs_cache(domain, form, 'last-run.log')
    http_reply_and_exit(data, 'text/plain')

if match(r'^/[a-z0-9-]+/liste-dossiers.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'),
                      encoding='utf-8')
    data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/data$', path_info):
    http_redirect(path_prefix + path_info + '/')

if match(r'^/[a-z0-9-]+/data/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'),
                          encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'),
                              encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    if len(l) > 0:
        l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
        title = '<p>Liste des documents disponibles :</p>\n'
        data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    else:
        data = '<html>\n<p>Aucun document disponible.</p>\n</html>'
    http_reply_and_exit(data, 'text/html')

if match(r'^/[a-z0-9-]+/data/index.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'),
                          encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'),
                              encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/data/[^/]+$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    doc = path_info.split('/')[3]
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'),
                          encoding='utf-8')
    if doc in dossiers:
        data = get_wcs_cache(domain, form, 'data_' + doc)
        data = json.loads(data, encoding='utf-8')
        data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
        http_reply_and_exit(data, 'application/json')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'),
                              encoding='utf-8')
    if doc in attachements:
        data = open(attachements[doc], 'rb').read()
        mime_type = mimetypes.guess_type(doc)[0]
        if mime_type is None:
            mime_type = 'application/octet-stream'
        http_reply_and_exit(data, mime_type)
    http_reply_and_exit("Document '%s' inconnu." % path_info, 'text/plain')

http_reply_and_exit("Requête '%s' inconnue." % path_info, 'text/plain')
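
# Example (made-up form and file names): a request for
# /dynexport/demande-de-bourse/data/0001-DOE-JOHN-john.doe-example.org_photo.jpg
# is resolved by the data/<document> route above through data-files.json and
# streamed back with the MIME type guessed from its extension (image/jpeg
# here); unknown extensions fall back to application/octet-stream.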