# -*- coding: utf-8 -*-
Dynamic data export tool for w.c.s.

Copyright: Agence universitaire de la Francophonie — www.auf.org
Licence: GNU General Public Licence, version 2
Author: Jean Christophe André
Created: 13 March 2013

Depends: wcs, python-simplejson, python-magic
- /dynexport => list of forms for the current domain
- /dynexport/domains.json => list of available domains
- /dynexport/formulaire => list of the options below
- /dynexport/formulaire/fields.json
- /dynexport/formulaire/field-names.json
- /dynexport/formulaire/field-names.txt
- /dynexport/formulaire/data.json
- /dynexport/formulaire/last-run.log
- /dynexport/formulaire/liste-dossiers.json
- /dynexport/formulaire/clear-cache => clears the cache
- /dynexport/formulaire/data/nom-dossier.json
- /dynexport/formulaire/data/nom-dossier_attachement-1.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-2.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-…
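
Example (illustrative; the domain and form names below are assumptions, not
values from this file):
  GET https://exemple.auf.org/dynexport/mon-formulaire/liste-dossiers.json
  returns the sorted list of exported dossier file names for that form.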
import time                    # time, gmtime, strftime, strptime, struct_time
import simplejson as json
import os
import sys
import logging
import unicodedata
import mimetypes
import magic
from re import match
from cStringIO import StringIO
from gzip import GzipFile
EXPIRE_DELAY = 5  # at most 5 seconds in the web cache
TIME_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'  # date format for HTTP headers
#ETABLISSEMENT_FORMAT = r'^(\w+\s-\s.+)\s\(\d+\s-\s(Nord|Sud)\)$'

WCS_ROOT_DIR = '/var/lib/wcs'
WCS_DOMAIN_SUFFIX = '.auf.org'
WCS_CACHE_DIR = '/var/tmp'
WCS_CACHE_DELAY_DEFAULT = 7*24*60*60  # one week
WCS_CACHE_DELAY_FORMS = 5*60  # 5 minutes
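
# Note (summary of the constants above): EXPIRE_DELAY only drives the HTTP
# Last-Modified/Expires headers sent to clients, while the WCS_CACHE_DELAY_*
# values drive how long generated files are kept in the on-disk cache under
# WCS_CACHE_DIR.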
#--------------------------------------------------------------------------
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
# processing functions
#--------------------------------------------------------------------------
def http_redirect(location, code='302'):
    headers = {}
    headers['Content-Type'] = 'text/plain; charset=utf-8'
    headers['Status'] = '%s Redirection' % code
    headers['Location'] = location
    data = """If you see this, it means the automatic redirection has failed.
Please go to ${location}"""
    data = data.replace('${location}', location)
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    f.write(data)
    f.close()
    sys.exit(0)
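
# Example call (illustrative): http_redirect(path_prefix + '/') emits the CGI
# "Status: 302 Redirection" and "Location:" headers on stdout and stops the
# script, exactly like the redirections used in the request handling below.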
def http_reply_and_exit(data, mime_type='text/html', charset='utf-8'):
    if data is None: data = ''
    current_time = time.time()
    mtime = time.gmtime(current_time)
    etime = time.gmtime(current_time + EXPIRE_DELAY)
    if 'HTTP_IF_MODIFIED_SINCE' in os.environ:
        itime = time.strptime(os.environ['HTTP_IF_MODIFIED_SINCE'], TIME_FORMAT)
    else:
        itime = None
    # prepare the headers and the data
    headers = {}
    headers['Content-Type'] = '%s; charset=%s' % (mime_type, charset)
    headers['Last-Modified'] = time.strftime(TIME_FORMAT, mtime)
    headers['Expires'] = time.strftime(TIME_FORMAT, etime)
    if os.environ['REQUEST_METHOD'] == 'GET' and (not itime or mtime > itime):
        # determine the requested variant (compressed or not)
        if os.environ.get('HTTP_ACCEPT_ENCODING', '').split(',').count('gzip') > 0:
            zdata = StringIO()
            GzipFile('', 'w', 9, zdata).write(data)
            data = zdata.getvalue()
            headers['Content-Encoding'] = 'gzip'
            headers['Vary'] = 'Content-Encoding'
        headers['Content-Length'] = len(data)
    # send the response
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    f.write(data)
    f.close()
    # stop processing
    sys.exit(0)
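
# Illustrative call (assumption, matching the handlers below):
#   http_reply_and_exit('{"ok": true}', 'application/json')
# would send the body with Last-Modified/Expires headers and gzip-compress it
# when the client announced "Accept-Encoding: gzip".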
def _reduce_to_alnum(s, replacement_char='-'):
    """Reduce a character string to plain alphanumeric characters."""
    if type(s) is not unicode:
        s = unicode(s, 'utf-8')
    s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
    r = ''
    for c in s:
        if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            r += c
        elif len(r) > 0 and r[-1] != replacement_char:
            r += replacement_char
        else:  # r == '' or r[-1] == replacement_char
            pass
    return r.strip(replacement_char)
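
# Illustrative examples (assumed inputs, not taken from real form data):
#   _reduce_to_alnum(u'Établissement de Dakar')    -> 'Etablissement-de-Dakar'
#   _reduce_to_alnum(u'Adresse électronique', '_') -> 'Adresse_electronique'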
def _make_wcs_cache_name(domain, form, name):
    return 'wcs-%s-%s-%s' % (domain, form, name)
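
# Illustrative example (assumed values):
#   _make_wcs_cache_name('exemple.auf.org', 'mon-formulaire', 'fields.json')
#   -> 'wcs-exemple.auf.org-mon-formulaire-fields.json' (stored in WCS_CACHE_DIR)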
def set_wcs_cache(domain, form, name, data, delay=WCS_CACHE_DELAY_DEFAULT):
    cache_filename = _make_wcs_cache_name(domain, form, name)
    cache_filename = os.path.join(WCS_CACHE_DIR, cache_filename)
    f = open(cache_filename, 'wb')
    f.write(data)
    f.close()
    # the modification time is used as the expiration time
    atime = time.time()
    mtime = atime + delay
    os.utime(cache_filename, (atime, mtime))
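
# Note: the cache TTL is encoded in the file's mtime, which is deliberately set
# in the future; get_wcs_cache() below treats a file whose mtime is already in
# the past as expired and deletes it.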
def get_wcs_cache(domain, form, name):
    cache_filename = _make_wcs_cache_name(domain, form, name)
    cache_filename = os.path.join(WCS_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filename):
        # the modification time is used as the expiration time
        if time.time() < os.path.getmtime(cache_filename):
            data = open(cache_filename, 'rb').read()
            return data
        os.unlink(cache_filename)
    return None
def clear_wcs_cache(domain, form):
    cache_filename = _make_wcs_cache_name(domain, form, '')
    for f in os.listdir(WCS_CACHE_DIR):
        if f.startswith(cache_filename):
            os.unlink(os.path.join(WCS_CACHE_DIR, f))
def set_wcs_publisher(domain):
    global pub
    from wcs import publisher
    pub = publisher.WcsPublisher.create_publisher()
    pub.app_dir = os.path.join(pub.app_dir, domain)
def get_wcs_domains():
    suffix = WCS_DOMAIN_SUFFIX
    root = WCS_ROOT_DIR
    l = os.listdir(root)
    return [x for x in l if os.path.isdir(os.path.join(root, x)) and x.endswith(suffix)]
def get_wcs_forms(domain):
    """Extract the list of forms."""
    data = get_wcs_cache(domain, 'ALL', 'ALL.json')
    if data is not None:
        return json.loads(data, encoding='utf-8')
    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    forms = [f.url_name for i, f in FormDef.items()]
    data = json.dumps(forms, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, 'ALL', 'ALL.json', data, WCS_CACHE_DELAY_FORMS)
    return forms
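
# Illustrative example (assumed domain and form names):
#   get_wcs_forms('exemple.auf.org') -> [u'inscription-2013', u'demande-de-bourse']
# The list is cached for WCS_CACHE_DELAY_FORMS (5 minutes) under the pseudo
# form name 'ALL'.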
def get_wcs_form_data(domain, form):
    """Extract the form data."""
    data = get_wcs_cache(domain, form, 'metadata.json')
    if data is not None:
        return json.loads(data, encoding='utf-8')

    # metadata dictionary (will be stored in the cache)
    metadata = {}

    logname = _make_wcs_cache_name(domain, form, 'last-run.log')
    logger = logging.getLogger('wcs-dynexport')
    logger.setLevel(logging.DEBUG)
    log_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    log_handler = logging.FileHandler(os.path.join(WCS_CACHE_DIR, logname))
    log_handler.setLevel(logging.DEBUG)
    log_handler.setFormatter(log_formatter)
    logger.addHandler(log_handler)

    logger.info('Début.')
    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    from wcs.fields import TitleField, CommentField, TextField, \
                           StringField, ItemField, ItemsField, EmailField, \
                           DateField, FileField, BoolField, TableField
    formdef = FormDef.get_by_urlname(form)

    # give every field a unique name
    fields = {}
    field_names = {}
    field_names_duplicates = {}
    for i, field in enumerate(formdef.fields):
        if isinstance(field, TitleField) or isinstance(field, CommentField):
            continue
        name = _reduce_to_alnum(field.label, '_').lower()
        if name in field_names.values():  # duplicate
            field_names_duplicates[name] = field_names_duplicates.get(name, 1) + 1
            name = '%s_%d' % (name, field_names_duplicates[name])
        field_names.update({field.id: name})
        fields.update({field.id: {'index': i, 'name': field_names[field.id], 'label': field.label, 'varname': field.varname and field.varname or ''}})

    data = json.dumps(fields, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'fields.json', data)
    metadata.update({'fields': fields})
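
    # Illustrative fields.json entry (field id, position and label are assumed):
    # a StringField labelled "Adresse électronique" at position 5 would appear as
    #   {"7": {"index": 5, "name": "adresse_electronique",
    #          "label": "Adresse électronique", "varname": ""}}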
    # load the MIME type database once and for all
    #magicmime = magic.Magic(mime=True) => for later…
    magicmime = magic.open(magic.MAGIC_MIME)
    magicmime.load()

    liste_dossiers = []
    liste_attachements = {}
    for object in formdef.data_class().select():
        if object.user is None:
            logger.warning("Dossier '%s' sans utilisateur associé ?!?"
                           " On ignore...", object.id)
            continue
        try:
            workflow_status = object.status.startswith('wf-') and \
                              object.get_workflow_status().name or None
        except:
            workflow_status = None
        result = {
            'num_dossier': object.id,
            'wcs_status': object.status,
            'wcs_workflow_status': workflow_status,
            'wcs_user_email': object.user.email,
            'wcs_user_display_name': object.user.display_name,
            #'wcs_last_modified': time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(object.last_modified())),
            'wcs_comments': [],
        }
        if object.evolution is not None:
            for e in object.evolution:
                if e.comment is not None:
                    who = pub.user_class.get(e.who).display_name
                    e_time = time.strftime('%Y-%m-%d %H:%M:%S', e.time)
                    comment = '%s -- %s %s' % (e.comment, who, e_time)
                    result['wcs_comments'].append(comment)
        qfiles = {}
        for field in formdef.fields:
            field_id = str(field.id)
            if not field_id in object.data:
                continue
            if isinstance(field, TitleField) or isinstance(field, CommentField):
                continue
            field_name = fields[field_id]['name']
            data = object.data.get(field_id)
            if data is None:
                result[field_name] = None
                continue
            if isinstance(field, StringField) or isinstance(field, TextField) \
               or isinstance(field, EmailField) or isinstance(field, ItemField):
                # clean up the institution name (drop the id and the Nord/Sud tag)
                #m = match(ETABLISSEMENT_FORMAT, data)
                # data = m.groups()[0]
                result[field_name] = data
            elif isinstance(field, ItemsField) or isinstance(field, TableField):
                result[field_name] = data  # list => could perhaps be joined with ';'
            elif isinstance(field, BoolField):
                result[field_name] = (data == 'True')
            elif isinstance(field, DateField):
                if isinstance(data, time.struct_time):
                    result[field_name] = '%04d-%02d-%02d' % (data.tm_year,
                                                             data.tm_mon, data.tm_mday)
                else:
                    result[field_name] = data
            elif isinstance(field, FileField):
                if '.' in data.orig_filename:
                    extension = data.orig_filename.rpartition('.')[2].lower()
                else:  # the file name has no extension
                    p = os.path.join(pub.app_dir, 'uploads', data.qfilename)
                    #m = magicmime.from_file(p) => for later…
                    m = magicmime.file(p).split()[0].strip(';')
                    extension = mimetypes.guess_extension(m)
                    if extension is None:
                        logger.warning("Type de fichier inconnu pour '%s'.", p)
                    if extension is not None:
                        extension = extension[1:]
                    else:
                        extension = 'unknown'
                result[field_name] = "%s.%s" % (field_name, extension)
                qfiles[field_name] = data.qfilename
            else:
                logger.warning("Type de champ inconnu '%s' pour '%s' (%s).",
                               field.__class__.__name__, field_name, field.label)
        num_dossier = result['num_dossier']
        nom = _reduce_to_alnum(result.get('nom', 'sans-nom')).upper()
        prenom = _reduce_to_alnum(result.get('prenom', 'sans-prenom')).upper()
        adel = result.get('adresse_electronique', 'sans-adel').replace('@', '-').lower()
        filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel)
        liste_dossiers.append(filename + '.json')
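
        # Illustrative result (assumed dossier values): num_dossier 42, nom
        # "Dupont", prenom "Marie" and adresse_electronique
        # "marie.dupont@example.org" give the base name
        # '0042-DUPONT-MARIE-marie.dupont-example.org'.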
        # record the paths of the attached files
        for f in qfiles:
            dst = filename + '_' + result[f]
            src = os.path.join(pub.app_dir, 'uploads', qfiles[f])
            liste_attachements.update({dst: src})
            # rename the attachment referenced in the dossier
            result[f] = dst

        # generate the JSON file
        data = json.dumps(result, ensure_ascii=False).encode('utf-8')
        set_wcs_cache(domain, form, 'data_%s.json' % filename, data)

        logger.info("Dossier '%s' : %s.",
                    filename, result['wcs_workflow_status'])
    data = json.dumps(liste_attachements, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'data-files.json', data)
    metadata.update({'attachements': liste_attachements})

    liste_dossiers.sort()
    data = json.dumps(liste_dossiers, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'liste-dossiers.json', data)
    metadata.update({'dossiers': liste_dossiers})
    data = json.dumps(metadata, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'metadata.json', data)
#if __name__ == '__main__':
#    extract_data(formdef, OUTPUT_DIRECTORY)
#    logger.exception("Interruption du traitement pour cause d'erreur !")

#--------------------------------------------------------------------------
# web request handling
#--------------------------------------------------------------------------
#for k in sorted(os.environ):
#    l.append('%s=%s\n' % (k, os.environ[k]))
#http_reply_and_exit(data, 'text/plain')
domain = os.environ.get('HTTP_HOST', '')
if domain not in get_wcs_domains():
    http_reply_and_exit("Domaine '%s' inconnu." % domain, 'text/plain')
path_info = os.environ.get('PATH_INFO', '')

path_prefix = os.environ.get('REQUEST_URI', '')
if len(path_info) > 0:
    path_prefix = path_prefix[:-len(path_info)]

if path_info == '':
    http_redirect(path_prefix + '/')
if path_info == '/':
    # list of available forms
    l = sorted(get_wcs_forms(domain))
    l = ['<li><a href="%s/">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des formulaires disponibles :</p>\n'
    data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    http_reply_and_exit(data, 'text/html')
if path_info == '/index.json':
    # list of available forms
    l = sorted(get_wcs_forms(domain))
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if path_info == '/domains.json':
    # list of available domains
    l = get_wcs_domains()
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
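
# Illustrative response for /dynexport/domains.json (assumed domain names):
#   ["exemple1.auf.org", "exemple2.auf.org"]
# i.e. the directories under WCS_ROOT_DIR whose name ends in WCS_DOMAIN_SUFFIX.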
if match(r'^/[a-z0-9-]+$', path_info):
    http_redirect(path_prefix + path_info + '/')
if match(r'^/[a-z0-9-]+/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = [ 'fields.json', 'field-names.json', 'field-names.txt', 'last-run.log', 'liste-dossiers.json' ]
    l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des informations disponibles :</p>\n'
    action1 = """<p><a href="data/">Export des données</a></p>\n"""
    action2 = """<p><a href="clear-cache">Suppression du cache</a> (pour ré-export)</p>\n"""
    data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n' + action1 + action2 + '</html>'
    http_reply_and_exit(data, 'text/html')
if match(r'^/[a-z0-9-]+/index.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = [ 'fields.json', 'field-names.json', 'field-names.txt', 'last-run.log', 'liste-dossiers.json', 'data', 'clear-cache' ]
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if match(r'^/[a-z0-9-]+/clear-cache$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    clear_wcs_cache(domain, form)
    http_reply_and_exit('Ok.', 'text/plain')
if match(r'^/[a-z0-9-]+/fields.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if match(r'^/[a-z0-9-]+/field-names.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = dict([(k, d[k]['name']) for k in d])
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if match(r'^/[a-z0-9-]+/field-names.txt$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = [(k, d[k]['name'], d[k]['label']) for k in d]
    d = sorted(d, key=lambda x: int(x[0]))
    text = u''.join([u'%s:%s:%s\n' % (x[0], x[1], x[2]) for x in d])
    data = text.encode('utf-8')
    http_reply_and_exit(data, 'text/plain')
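
# Illustrative field-names.txt content (assumed fields), one "id:name:label"
# line per field:
#   1:nom:Nom
#   2:prenom:Prénom
#   3:adresse_electronique:Adresse électronique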
if match(r'^/[a-z0-9-]+/last-run.log$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = get_wcs_cache(domain, form, 'last-run.log')
    http_reply_and_exit(data, 'text/plain')
if match(r'^/[a-z0-9-]+/liste-dossiers.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if match(r'^/[a-z0-9-]+/data$', path_info):
    http_redirect(path_prefix + path_info + '/')
if match(r'^/[a-z0-9-]+/data/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    if len(l) > 0:
        l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
        title = '<p>Liste des documents disponibles :</p>\n'
        data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    else:
        data = '<html>\n<p>Aucun document disponible.</p>\n</html>'
    http_reply_and_exit(data, 'text/html')
if match(r'^/[a-z0-9-]+/data/index.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')
if match(r'^/[a-z0-9-]+/data/[^/]+$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    doc = path_info.split('/')[3]
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    if doc in dossiers:
        data = get_wcs_cache(domain, form, 'data_' + doc)
        data = json.loads(data, encoding='utf-8')
        data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
        http_reply_and_exit(data, 'application/json')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    if doc in attachements:
        data = open(attachements[doc], 'rb').read()
        mime_type = mimetypes.guess_type(doc)[0]
        if mime_type is None:
            mime_type = 'application/octet-stream'
        http_reply_and_exit(data, mime_type)
    http_reply_and_exit("Document '%s' inconnu." % path_info, 'text/plain')
http_reply_and_exit("Requête '%s' inconnue." % path_info, 'text/plain')