list2form : gestion des erreurs et des petits fichiers (merci cgi.py…)
[progfou.git] / wcs / wcs-extract
CommitLineData
0f48356a
P
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
edf3eba0
P
3"""
4Outil d'export de données w.c.s.
0f48356a 5
edf3eba0
P
6Copyright : Agence universitaire de la Francophonie — www.auf.org
7Licence : GNU General Public Licence, version 2
8Auteur : Jean Christophe André
9Date de création : 15 octobre 2009
10
11Depends: wcs, python-simplejson, python-magic
12"""
0f48356a
P
13import os
14import os.path
a9de62a1
P
15import shutil
16import logging
abdb4a8c 17from time import gmtime, strftime, struct_time
0f48356a 18import simplejson as json
edf3eba0
P
19import magic
20import mimetypes
abdb4a8c 21import unicodedata
0f48356a 22
0cd61ae9
P
23from wcs import publisher
24from wcs.formdef import FormDef
25from wcs.fields import TitleField, CommentField, TextField, \
ee4d6a09 26 StringField, ItemField, ItemsField, EmailField, \
d3a019f6 27 DateField, FileField, BoolField, TableField
0f48356a 28
0f48356a 29
0cd61ae9
P
def reduce_to_alnum(s, replacement_char='-'):
    """Reduce a string to ASCII letters and digits.

    Accented characters are replaced by their unaccented base letter (NFKD
    decomposition); characters that have no ASCII equivalent are dropped.
    Every run of remaining non-alphanumeric characters found *between* two
    alphanumeric characters becomes a single ``replacement_char``; leading
    and trailing runs are removed.

    :param s: text to reduce; a byte string is decoded as UTF-8.
    :param replacement_char: separator inserted between alphanumeric runs
        (may be empty or longer than one character).
    :returns: the reduced text as a native ``str``.
    :raises TypeError: if ``s`` is neither a byte string nor a text string.
    """
    # isinstance() instead of an exact type() test, so that subclasses of the
    # text type are accepted too.
    if isinstance(s, bytes):
        s = s.decode('utf-8')

    # Decompose accented characters, then throw away whatever is not ASCII
    # (combining accents, symbols, non-latin scripts).
    ascii_text = unicodedata.normalize('NFKD', s) \
        .encode('ascii', 'ignore').decode('ascii')

    pieces = []
    separator_pending = False
    for c in ascii_text:
        if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            # Emit the separator lazily: only once we know another
            # alphanumeric character follows, and never at the very start.
            if separator_pending and pieces:
                pieces.append(replacement_char)
            separator_pending = False
            pieces.append(c)
        else:
            separator_pending = True

    # str() keeps the historical return type: a byte string on Python 2,
    # and it is a no-op on Python 3.
    return str(''.join(pieces))
0f48356a 45
0f48356a 46
0cd61ae9
P
def extract_fields(formdef, output_directory):
    """Give every data field of a form a unique export name.

    Title and comment fields are skipped.  A field is named after its
    ``varname`` when it has one, otherwise after its label reduced to
    lower-case alphanumerics separated by ``_``.  When a name is already
    taken, a numeric suffix is appended (``name_2``, ``name_3``, ...) and
    incremented until the result is really unused.

    Side effects:
      * the module-level global ``field_names`` is rebound to the new
        ``{field.id: name}`` mapping (``extract_data`` reads it);
      * ``field-names.txt`` (one ``id:name:label`` line per field) and
        ``field-names.json`` are written into ``output_directory``.

    :param formdef: form definition exposing a ``fields`` sequence.
    :param output_directory: existing directory receiving the two files.
    :returns: the ``{field.id: name}`` mapping (also stored in the global).
    """
    # XXX: temporary hack, the mapping is shared through a module global.
    global field_names

    field_names = names = {}
    used_names = set()       # O(1) membership test instead of .values()
    duplicate_counts = {}    # base name -> last numeric suffix handed out

    with open(os.path.join(output_directory, 'field-names.txt'), 'w') as f:
        for field in formdef.fields:
            if isinstance(field, (TitleField, CommentField)):
                continue  # purely decorative fields carry no data

            if field.varname:
                name = field.varname
            else:
                name = reduce_to_alnum(field.label, '_').lower()

            if name in used_names:  # duplicate
                base = name
                count = duplicate_counts.get(base, 1)
                # Keep counting until the suffixed name is free: another
                # field may already be called e.g. "<base>_2" explicitly.
                while True:
                    count += 1
                    name = '%s_%d' % (base, count)
                    if name not in used_names:
                        break
                duplicate_counts[base] = count

            used_names.add(name)
            names[field.id] = name
            f.write("%s:%s:%s\n" % (field.id, name, field.label))

    payload = json.dumps(names, ensure_ascii=False)
    # With ensure_ascii=False the encoder may return a text string; the file
    # is opened in binary mode, so encode it explicitly as UTF-8.
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    with open(os.path.join(output_directory, 'field-names.json'), 'wb') as f:
        f.write(payload)

    return names
76
77
def _write_json(path, payload):
    """Serialise ``payload`` as JSON and write it to ``path`` as UTF-8."""
    text = json.dumps(payload, ensure_ascii=False)
    # With ensure_ascii=False the encoder may return either a byte string or
    # a text string; only the latter still has to be encoded.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    with open(path, 'wb') as f:
        f.write(text)


def _upload_extension(upload, uploads_directory, magicmime):
    """Return the extension (without the dot) for an exported upload.

    The extension of the original file name is used when there is one;
    otherwise the MIME type of the stored file is sniffed with
    ``magicmime`` and mapped to an extension, ``'unknown'`` being the
    last resort.
    """
    if '.' in upload.orig_filename:
        extension = upload.orig_filename.rpartition('.')[2].lower()
        # The original name is chosen by the submitter: never let a path
        # separator reach the destination file name.
        if '/' not in extension and '\\' not in extension:
            return extension

    # No usable extension in the file name: look at the content instead.
    p = os.path.join(uploads_directory, upload.qfilename)
    try:
        m = magicmime.file(p).split()[0].strip(';')
        extension = mimetypes.guess_extension(m)
    except Exception:
        logging.warning("Type de fichier inconnu pour '%s'.", p)
        extension = None
    if extension is None:
        return 'unknown'
    return extension[1:]  # guess_extension() includes the leading dot


def _value_or(value, default):
    """Return ``default`` when ``value`` is None, ``value`` otherwise."""
    if value is None:
        return default
    return value


def extract_data(formdef, output_directory):
    """Export every submission of a form as JSON plus its attached files.

    For each submission that has an associated user, a JSON document is
    written to ``<output_directory>/data/<wcs status>/<name>.json`` where
    ``<name>`` is built from the submission number and the ``nom``,
    ``prenom`` and ``adresse_electronique`` fields.  Uploaded files are
    copied next to it (only when missing or older than the source).  The
    sorted list of JSON file names is finally written to
    ``<output_directory>/liste-dossiers.json``.

    Relies on two module-level globals: ``pub`` (the w.c.s. publisher, set
    by the script entry point) and ``field_names`` (set by
    ``extract_fields``, which must therefore run first).

    :param formdef: form definition (``fields`` and ``data_class()``).
    :param output_directory: export root directory.
    """
    # TODO: should return a result that would then be saved by the caller

    # XXX: temporary hack... :-/
    global pub

    uploads_directory = os.path.join(pub.app_dir, 'uploads')

    # load the MIME type database once and for all
    magicmime = magic.open(magic.MAGIC_MIME)
    magicmime.load()

    liste_dossiers = []
    for formdata in formdef.data_class().select():
        if formdata.user is None:
            logging.warning("Dossier '%s' sans utilisateur associé ?!?"
                            " On ignore...", formdata.id)
            continue

        # Only workflow-driven statuses ("wf-...") have a readable name.
        workflow_status = None
        try:
            if formdata.status.startswith('wf-'):
                workflow_status = formdata.get_workflow_status().name or None
        except Exception:
            workflow_status = None

        result = {
            'num_dossier': formdata.id,
            'wcs_status': formdata.status,
            'wcs_workflow_status': workflow_status,
            'wcs_user_email': formdata.user.email,
            'wcs_user_display_name': formdata.user.display_name,
            #'wcs_last_modified': strftime('%Y-%m-%d %H:%M:%S', gmtime(formdata.last_modified())),
            'wcs_comments': [],
        }

        # Comments left along the workflow history, with author and date.
        if formdata.evolution is not None:
            for e in formdata.evolution:
                if e.comment is None:
                    continue
                try:
                    who = pub.user_class.get(e.who).display_name
                except Exception:
                    who = 'Inconnu(e)'
                when = strftime('%Y-%m-%d %H:%M:%S', e.time)
                result['wcs_comments'].append(
                    '%s -- %s %s' % (e.comment, who, when))

        qfiles = {}  # export field name -> stored (queued) upload name
        for field in formdef.fields:
            field_id = str(field.id)
            if field_id not in formdata.data:
                continue
            if isinstance(field, (TitleField, CommentField)):
                continue
            field_name = field_names[field_id]
            data = formdata.data.get(field_id)
            if data is None:
                result[field_name] = None
                continue
            if isinstance(field, (StringField, TextField,
                                  EmailField, ItemField)):
                result[field_name] = data
            elif isinstance(field, (ItemsField, TableField)):
                result[field_name] = data  # a list => maybe join on ';'
            elif isinstance(field, BoolField):
                # Historically compared to the string 'True' only, which is
                # always false for a real bool.
                # NOTE(review): confirm which form w.c.s. actually stores.
                result[field_name] = (data is True or data == 'True')
            elif isinstance(field, DateField):
                if isinstance(data, struct_time):
                    result[field_name] = '%04d-%02d-%02d' % (
                        data.tm_year, data.tm_mon, data.tm_mday)
                else:
                    result[field_name] = data
            elif isinstance(field, FileField):
                extension = _upload_extension(data, uploads_directory,
                                              magicmime)
                result[field_name] = "%s.%s" % (field_name, extension)
                qfiles[field_name] = data.qfilename
            else:
                logging.warning("Type de champ inconnu '%s' pour '%s' (%s).",
                                field.__class__.__name__, field_name,
                                field.label)

        num_dossier = result['num_dossier']
        # A field that exists but was left empty is stored as None above, so
        # dict.get()'s default is not enough: fall back explicitly.
        nom = reduce_to_alnum(
            _value_or(result.get('nom'), 'sans-nom')).upper()
        prenom = reduce_to_alnum(
            _value_or(result.get('prenom'), 'sans-prenom')).upper()
        adel = _value_or(result.get('adresse_electronique'), 'sans-adel')
        # '/' is legal in an e-mail address but must not end up in a file
        # name, where it would designate another directory.
        adel = adel.replace('@', '-').replace('/', '-').lower()

        filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel)
        liste_dossiers.append(filename + '.json')

        # create the destination sub-directory if needed
        dstdir = os.path.join(output_directory, 'data', result['wcs_status'])
        if not os.path.isdir(dstdir):
            os.makedirs(dstdir)

        # copy the attached files
        for f in qfiles:
            result[f] = filename + '_' + result[f]
            src = os.path.join(uploads_directory, qfiles[f])
            dst = os.path.join(dstdir, result[f])
            if not os.path.exists(dst) \
                    or os.path.getmtime(src) > os.path.getmtime(dst):
                shutil.copy2(src, dst)
                os.chmod(dst, 0o644)

        # generate the JSON file
        _write_json(os.path.join(dstdir, filename + '.json'), result)

        logging.info("Dossier '%s' : %s.",
                     filename, result['wcs_workflow_status'])

    liste_dossiers.sort()
    _write_json(os.path.join(output_directory, 'liste-dossiers.json'),
                liste_dossiers)
206
0cd61ae9
P
207
if __name__ == '__main__':
    # Command-line entry point:
    #   wcs-extract <destination-directory> <site> <form>
    # Exports the given form of the given w.c.s. site (virtual host) into
    # <destination-directory>/<site>/<form>, logging to last-run.log there.
    import sys

    if len(sys.argv) != 4:
        sys.stderr.write(
            "Usage : %s <dossier-destination> <site> <formulaire>\n"
            % sys.argv[0])
        sys.exit(1)

    VHOST = sys.argv[2]
    FORM_NAME = sys.argv[3]
    OUTPUT_DIRECTORY = os.path.join(sys.argv[1], VHOST, FORM_NAME)

    os.umask(0o022)
    # create the extraction directory if needed
    if not os.path.isdir(os.path.join(OUTPUT_DIRECTORY, 'data')):
        os.makedirs(os.path.join(OUTPUT_DIRECTORY, 'data'), 0o755)

    # The log file is truncated on every run (filemode='w').
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename=os.path.join(OUTPUT_DIRECTORY,
                                              'last-run.log'),
                        filemode='w')

    logging.info('Début.')

    # `pub` must stay a module-level name: extract_data() reads it as a
    # global.
    pub = publisher.WcsPublisher.create_publisher()
    pub.app_dir = os.path.join(pub.app_dir, VHOST)
    pub.set_config()

    formdef = FormDef.get_by_urlname(FORM_NAME)

    extract_fields(formdef, OUTPUT_DIRECTORY)

    try:
        extract_data(formdef, OUTPUT_DIRECTORY)
    except Exception:
        # Top-level boundary: record the traceback in the log file.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        logging.exception("Interruption du traitement pour cause d'erreur !")

    logging.info('Fin.')
0f48356a 245