wcs-dynexport: integration of the switch to PostgreSQL
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Dynamic export tool for w.c.s. data.

Copyright: Agence universitaire de la Francophonie — www.auf.org
Licence: GNU General Public License, version 2
Author: Jean Christophe André
Created: 13 March 2013

Depends: wcs, python-simplejson, python-magic

Available URLs:
- /dynexport => list of forms for the current domain
- /dynexport/domains.json => list of available domains
- /dynexport/formulaire => list of the options below
- /dynexport/formulaire/fields.json
- /dynexport/formulaire/field-names.json
- /dynexport/formulaire/field-names.txt
- /dynexport/formulaire/data.json
- /dynexport/formulaire/last-run.log
- /dynexport/formulaire/liste-dossiers.json
- /dynexport/formulaire/clear-cache => empties the cache
- /dynexport/formulaire/data/nom-dossier.json
- /dynexport/formulaire/data/nom-dossier_attachement-1.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-2.xxx
- /dynexport/formulaire/data/nom-dossier_attachement-…
"""
import sys
import os
import os.path
import logging
import time  # time, gmtime, strftime, strptime, struct_time
import simplejson as json
import magic
import mimetypes
import unicodedata
from cStringIO import StringIO
from gzip import GzipFile
from re import match

DELAIS = 5  # keep in cache for at most 5 seconds
TIME_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'  # HTTP date format

WCS_ROOT_DIR = '/var/lib/wcs'
WCS_DOMAIN_SUFFIX = '.auf.org'
WCS_CACHE_DIR = '/var/tmp'

#--------------------------------------------------------------------------
# global variables
#--------------------------------------------------------------------------

pub = None

#--------------------------------------------------------------------------
# processing functions
#--------------------------------------------------------------------------

def http_redirect(location, code='302'):
    headers = {}
    headers['Content-Type'] = 'text/plain; charset=utf-8'
    headers['Status'] = '%s Redirection' % code
    headers['Location'] = location
    data = """If you see this, it means the automatic redirection has failed.
Please go to %s""" % location
    # send the response
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    if data:
        f.write(data)
    f.flush()
    # stop processing
    sys.exit(0)

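# http_reply_and_exit() implements a small caching contract: it stamps
# Last-Modified and Expires (now + DELAIS seconds), honours If-Modified-Since
# by resending only the headers when the client copy is still fresh, and
# compresses the body with gzip when the client advertises it.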
def http_reply_and_exit(data, mime_type='text/html', charset='utf-8'):
    # time references
    current_time = time.time()
    mtime = time.gmtime(current_time)
    etime = time.gmtime(current_time + DELAIS)
    if 'HTTP_IF_MODIFIED_SINCE' in os.environ:
        try:
            itime = time.strptime(os.environ['HTTP_IF_MODIFIED_SINCE'], TIME_FORMAT)
        except ValueError:
            itime = None
    else:
        itime = None
    # prepare the headers and the body
    headers = {}
    headers['Content-Type'] = '%s; charset=%s' % (mime_type, charset)
    headers['Last-Modified'] = time.strftime(TIME_FORMAT, mtime)
    headers['Expires'] = time.strftime(TIME_FORMAT, etime)
    if os.environ['REQUEST_METHOD'] == 'GET' and (not itime or mtime > itime):
        # pick the representation to send (compressed or not)
        if os.environ.get('HTTP_ACCEPT_ENCODING', '').split(',').count('gzip') > 0:
            zdata = StringIO()
            GzipFile('', 'w', 9, zdata).write(data)
            data = zdata.getvalue()
            headers['Content-Encoding'] = 'gzip'
            headers['Vary'] = 'Accept-Encoding'
            headers['Content-Length'] = len(data)
    else:
        data = None
    # send the response
    headers = ''.join(map(lambda x: "%s: %s\r\n" % (x, headers[x]), headers))
    f = open('/dev/stdout', 'wb')
    f.write(headers + "\r\n")
    if data:
        f.write(data)
    f.flush()
    # stop processing
    sys.exit(0)


def _reduce_to_alnum(s, replacement_char='-'):
    """Reduce a character string to plain alphanumeric ASCII."""

    if type(s) is not unicode:
        s = unicode(s, 'utf-8')
    s = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
    r = ''
    for c in s:
        if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            r += c
        elif len(r) > 0 and r[-1] != replacement_char:
            r += replacement_char
        else:  # r == '' or r[-1] == replacement_char
            pass
    return r.strip(replacement_char)

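# Illustration (not executed): _reduce_to_alnum(u'Prénom composé', '_') returns
# 'Prenom_compose'; accents are dropped by the NFKD/ASCII step and every run of
# non-alphanumeric characters collapses into a single replacement character.
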
def _make_wcs_cache_name(domain, form, name):
    return 'wcs-%s-%s-%s' % (domain, form, name)

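# The cache is a set of flat files under WCS_CACHE_DIR, one per exported
# artefact, named by _make_wcs_cache_name(), e.g. (illustrative)
# 'wcs-formulaires.auf.org-inscription-fields.json'; clear_wcs_cache()
# removes every file sharing the 'wcs-<domain>-<form>-' prefix.
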
def set_wcs_cache(domain, form, name, data):
    os.umask(0022)
    cache_filename = _make_wcs_cache_name(domain, form, name)
    f = open(os.path.join(WCS_CACHE_DIR, cache_filename), 'wb')
    f.write(data)
    f.close()

def get_wcs_cache(domain, form, name):
    data = None
    cache_filename = _make_wcs_cache_name(domain, form, name)
    cache_filename = os.path.join(WCS_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filename):
        f = open(cache_filename, 'rb')
        data = f.read()
        f.close()
    return data

def clear_wcs_cache(domain, form):
    cache_filename = _make_wcs_cache_name(domain, form, '')
    for f in os.listdir(WCS_CACHE_DIR):
        if f.startswith(cache_filename):
            os.unlink(os.path.join(WCS_CACHE_DIR, f))

def set_wcs_publisher(domain):
    global pub
    if pub is None:
        from wcs import publisher
        pub = publisher.WcsPublisher.create_publisher()
        pub.app_dir = os.path.join(pub.app_dir, domain)
        pub.set_config()

def get_wcs_domains():
    root = WCS_ROOT_DIR
    suffix = WCS_DOMAIN_SUFFIX
    try:
        l = os.listdir(root)
    except OSError:
        return None
    return [x for x in l if os.path.isdir(os.path.join(root, x)) and x.endswith(suffix)]

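# Each w.c.s. virtual host is a directory under WCS_ROOT_DIR whose name ends
# with WCS_DOMAIN_SUFFIX; for instance (hypothetical) /var/lib/wcs/formulaires.auf.org
# is the instance served when HTTP_HOST is 'formulaires.auf.org'.
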
def get_wcs_forms(domain):
    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    return [f.url_name for i, f in FormDef.items()]

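# get_wcs_form_data() does the heavy lifting: unless a cached metadata.json
# already exists, it assigns a unique name to every field of the form, then
# walks all submitted dossiers and writes one data_<dossier>.json per dossier,
# plus fields.json, data-files.json (attachments), liste-dossiers.json and
# last-run.log, into the cache directory.
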
def get_wcs_form_data(domain, form):
    """Extract the form data (filling the cache as a side effect)."""
    data = get_wcs_cache(domain, form, 'metadata.json')
    if data is not None:
        return json.loads(data, encoding='utf-8')
    # dictionary of metadata (cached at the end of the run)
    metadata = {}

    os.umask(0022)
    logname = _make_wcs_cache_name(domain, form, 'last-run.log')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename=os.path.join(WCS_CACHE_DIR, logname),
                        filemode='w')

    logging.info('Début.')

    set_wcs_publisher(domain)
    from wcs.formdef import FormDef
    from wcs.fields import TitleField, CommentField, TextField, \
        StringField, ItemField, ItemsField, EmailField, \
        DateField, FileField, BoolField, TableField
    formdef = FormDef.get_by_urlname(form)

    # give every field a unique name
    fields = {}
    field_names = {}
    field_names_duplicates = {}
    for i, field in enumerate(formdef.fields):
        if isinstance(field, TitleField) or isinstance(field, CommentField):
            continue
        if field.varname:
            name = field.varname
        else:
            name = _reduce_to_alnum(field.label, '_').lower()
        if name in field_names.values():  # duplicate
            field_names_duplicates[name] = field_names_duplicates.get(name, 1) + 1
            name = '%s_%d' % (name, field_names_duplicates[name])
        field_names.update({field.id: name})
        fields.update({field.id: {'index': i, 'name': field_names[field.id],
                                  'label': field.label,
                                  'varname': field.varname and field.varname or ''}})

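    # fields.json therefore maps each w.c.s. field id to its metadata, e.g.
    # (illustrative): {"1": {"index": 0, "name": "nom", "label": "Nom", "varname": ""}}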
    data = json.dumps(fields, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'fields.json', data)
    metadata.update({'fields': fields})

    # load the MIME type database once and for all
    #magicmime = magic.Magic(mime=True) => for later…
    magicmime = magic.open(magic.MAGIC_MIME)
    magicmime.load()

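    # Each dossier is exported as <num_dossier>-<NOM>-<PRENOM>-<adel>.json (see
    # the naming below); its attachments are exposed alongside it under the
    # name <dossier>_<field>.<extension> and indexed in data-files.json.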
    liste_dossiers = []
    liste_attachements = {}
    for object in formdef.data_class().select():
        if object.user is None:
            logging.warning("Dossier '%s' sans utilisateur associé ?!?"
                            " On ignore...", object.id)
            continue

        try:
            workflow_status = object.status.startswith('wf-') and \
                object.get_workflow_status().name or None
        except:
            workflow_status = None

        result = {
            'num_dossier': object.id,
            'wcs_status': object.status,
            'wcs_workflow_status': workflow_status,
            'wcs_user_email': object.user.email,
            'wcs_user_display_name': object.user.display_name,
            #'wcs_last_modified': time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(object.last_modified())),
            'wcs_comments': [],
        }

        if object.evolution is not None:
            for e in object.evolution:
                if e.comment is not None:
                    who = pub.user_class.get(e.who).display_name
                    e_time = time.strftime('%Y-%m-%d %H:%M:%S', e.time)
                    comment = '%s -- %s %s' % (e.comment, who, e_time)
                    result['wcs_comments'].append(comment)

        qfiles = {}
        for field in formdef.fields:
            field_id = str(field.id)
            if field_id not in object.data:
                continue
            if isinstance(field, TitleField) or isinstance(field, CommentField):
                continue
            field_name = fields[field_id]['name']
            data = object.data.get(field_id)
            if data is None:
                result[field_name] = None
                continue
            if isinstance(field, StringField) or isinstance(field, TextField) \
                    or isinstance(field, EmailField) or isinstance(field, ItemField):
                result[field_name] = data
            elif isinstance(field, ItemsField) or isinstance(field, TableField):
                result[field_name] = data  # list => could perhaps be joined with ';'
            elif isinstance(field, BoolField):
                result[field_name] = (data == 'True')
            elif isinstance(field, DateField):
                if isinstance(data, time.struct_time):
                    result[field_name] = '%04d-%02d-%02d' % (data.tm_year,
                                                             data.tm_mon, data.tm_mday)
                else:
                    result[field_name] = data
            elif isinstance(field, FileField):
                if '.' in data.orig_filename:
                    extension = data.orig_filename.rpartition('.')[2].lower()
                else:  # the file name carries no extension
                    p = os.path.join(pub.app_dir, 'uploads', data.qfilename)
                    try:
                        #m = magicmime.from_file(p) => for later…
                        m = magicmime.file(p).split()[0].strip(';')
                        extension = mimetypes.guess_extension(m)
                    except:
                        logging.warning("Type de fichier inconnu pour '%s'.", p)
                        extension = None
                    if extension is not None:
                        extension = extension[1:]  # drop the leading dot returned by guess_extension()
                    else:
                        extension = 'unknown'
                result[field_name] = "%s.%s" % (field_name, extension)
                qfiles[field_name] = data.qfilename
            else:
                logging.warning("Type de champ inconnu '%s' pour '%s' (%s).",
                                field.__class__.__name__, field_name, field.label)

        num_dossier = result['num_dossier']
        nom = _reduce_to_alnum(result.get('nom', 'sans-nom')).upper()
        prenom = _reduce_to_alnum(result.get('prenom', 'sans-prenom')).upper()
        adel = result.get('adresse_electronique', 'sans-adel').replace('@', '-').lower()

        filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel)
        liste_dossiers.append(filename + '.json')

        # record the paths of the attached files
        for f in qfiles:
            dst = filename + '_' + result[f]
            src = os.path.join(pub.app_dir, 'uploads', qfiles[f])
            liste_attachements.update({dst: src})
            # rename the attachment as it is referenced inside the dossier
            result[f] = dst

        # generate this dossier's JSON file
        data = json.dumps(result, ensure_ascii=False).encode('utf-8')
        set_wcs_cache(domain, form, 'data_%s.json' % filename, data)

        logging.info("Dossier '%s' : %s.",
                     filename, result['wcs_workflow_status'])

    data = json.dumps(liste_attachements, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'data-files.json', data)
    metadata.update({'attachements': liste_attachements})

    liste_dossiers.sort()
    data = json.dumps(liste_dossiers, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'liste-dossiers.json', data)
    metadata.update({'dossiers': liste_dossiers})

    logging.info('Fin.')

    data = json.dumps(metadata, ensure_ascii=False).encode('utf-8')
    set_wcs_cache(domain, form, 'metadata.json', data)

#if __name__ == '__main__':
#    try:
#        extract_data(formdef, OUTPUT_DIRECTORY)
#    except:
#        logging.exception("Interruption du traitement pour cause d'erreur !")

#--------------------------------------------------------------------------
# web request handling
#--------------------------------------------------------------------------

#l = []
#for k in sorted(os.environ):
#    l.append('%s=%s\n' % (k, os.environ[k]))
#data = ''.join(l)
#http_reply_and_exit(data, 'text/plain')

domain = os.environ.get('HTTP_HOST', '')
if domain not in (get_wcs_domains() or []):
    http_reply_and_exit("Domaine '%s' inconnu." % domain, 'text/plain')

path_info = os.environ.get('PATH_INFO', '')

path_prefix = os.environ.get('REQUEST_URI', '')
if len(path_info) > 0:
    path_prefix = path_prefix[:-len(path_info)]

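# Example (hypothetical values): with REQUEST_URI='/dynexport/inscription/fields.json'
# and PATH_INFO='/inscription/fields.json', path_prefix is '/dynexport'; the
# redirections below are built from that prefix.
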
if path_info == '':
    http_redirect(path_prefix + '/')

if path_info == '/':
    # list of available forms
    l = sorted(get_wcs_forms(domain))
    l = ['<li><a href="%s/">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des formulaires disponibles&nbsp;:</p>\n'
    data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    http_reply_and_exit(data, 'text/html')

if path_info == '/index.json':
    # list of available forms
    l = sorted(get_wcs_forms(domain))
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if path_info == '/domains.json':
    # list of available domains
    l = get_wcs_domains()
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+$', path_info):
    http_redirect(path_prefix + path_info + '/')

if match(r'^/[a-z0-9-]+/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = ['fields.json', 'field-names.json', 'field-names.txt', 'last-run.log', 'liste-dossiers.json']
    l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
    title = '<p>Liste des informations disponibles&nbsp;:</p>\n'
    action1 = """<p><a href="data/">Export des données</a></p>\n"""
    action2 = """<p><a href="clear-cache">Suppression du cache</a> (pour ré-export)</p>\n"""
    data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n' + action1 + action2 + '</html>'
    http_reply_and_exit(data, 'text/html')

if match(r'^/[a-z0-9-]+/index\.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    l = ['fields.json', 'field-names.json', 'field-names.txt', 'last-run.log', 'liste-dossiers.json', 'data', 'clear-cache']
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/clear-cache$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    clear_wcs_cache(domain, form)
    http_reply_and_exit('Ok.', 'text/plain')

if match(r'^/[a-z0-9-]+/fields\.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/field-names\.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = dict([(k, d[k]['name']) for k in d])
    data = json.dumps(d, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/field-names\.txt$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    d = json.loads(get_wcs_cache(domain, form, 'fields.json'), encoding='utf-8')
    d = [(k, d[k]['name'], d[k]['label']) for k in d]
    d = sorted(d, key=lambda x: int(x[0]))
    text = u''.join([u'%s:%s:%s\n' % (x[0], x[1], x[2]) for x in d])
    data = text.encode('utf-8')
    http_reply_and_exit(data, 'text/plain')

if match(r'^/[a-z0-9-]+/last-run\.log$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = get_wcs_cache(domain, form, 'last-run.log')
    http_reply_and_exit(data, 'text/plain')

if match(r'^/[a-z0-9-]+/liste-dossiers\.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    data = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

if match(r'^/[a-z0-9-]+/data$', path_info):
    http_redirect(path_prefix + path_info + '/')

if match(r'^/[a-z0-9-]+/data/$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    if len(l) > 0:
        l = ['<li><a href="%s">%s</a></li>' % (f, f) for f in l]
        title = '<p>Liste des documents disponibles&nbsp;:</p>\n'
        data = '<html>\n' + title + '<ul>\n' + '\n'.join(l) + '\n</ul>\n</html>'
    else:
        data = '<html>\n<p>Aucun document disponible.</p>\n</html>'
    http_reply_and_exit(data, 'text/html')

if match(r'^/[a-z0-9-]+/data/index\.json$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    l = sorted(dossiers + attachements.keys())
    data = json.dumps(l, ensure_ascii=False, indent=' ').encode('utf-8')
    http_reply_and_exit(data, 'application/json')

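# Individual document dispatch: a name listed in liste-dossiers.json is served
# from the cached data_<name> JSON; a name listed in data-files.json is read
# from the w.c.s. uploads directory and served with a MIME type guessed from
# its extension (application/octet-stream as a fallback).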
if match(r'^/[a-z0-9-]+/data/[^/]+$', path_info):
    form = path_info.split('/')[1]
    if form not in get_wcs_forms(domain):
        http_reply_and_exit("Formulaire '%s' inconnu." % form, 'text/plain')
    get_wcs_form_data(domain, form)
    doc = path_info.split('/')[3]
    dossiers = json.loads(get_wcs_cache(domain, form, 'liste-dossiers.json'), encoding='utf-8')
    if doc in dossiers:
        data = get_wcs_cache(domain, form, 'data_' + doc)
        data = json.loads(data, encoding='utf-8')
        data = json.dumps(data, ensure_ascii=False, indent=' ').encode('utf-8')
        http_reply_and_exit(data, 'application/json')
    attachements = json.loads(get_wcs_cache(domain, form, 'data-files.json'), encoding='utf-8')
    if doc in attachements:
        data = open(attachements[doc], 'rb').read()
        mime_type = mimetypes.guess_type(doc)[0]
        if mime_type is None:
            mime_type = 'application/octet-stream'
        http_reply_and_exit(data, mime_type)
    http_reply_and_exit("Document '%s' inconnu." % path_info, 'text/plain')

http_reply_and_exit("Requête '%s' inconnue." % path_info, 'text/plain')