Commit | Line | Data |
---|---|---|
0f48356a P |
1 | #!/usr/bin/env python |
2 | # -*- coding: utf-8 -*- | |
edf3eba0 P |
3 | """ |
4 | Outil d'export de données w.c.s. | |
0f48356a | 5 | |
edf3eba0 P |
6 | Copyright : Agence universitaire de la Francophonie — www.auf.org |
7 | Licence : GNU General Public Licence, version 2 | |
8 | Auteur : Jean Christophe André | |
9 | Date de création : 15 octobre 2009 | |
10 | ||
11 | Depends: wcs, python-simplejson, python-magic | |
12 | """ | |
0f48356a P |
13 | import os |
14 | import os.path | |
a9de62a1 P |
15 | import shutil |
16 | import logging | |
abdb4a8c | 17 | from time import gmtime, strftime, struct_time |
0f48356a | 18 | import simplejson as json |
edf3eba0 P |
19 | import magic |
20 | import mimetypes | |
abdb4a8c | 21 | import unicodedata |
0f48356a | 22 | |
0cd61ae9 P |
23 | from wcs import publisher |
24 | from wcs.formdef import FormDef | |
25 | from wcs.fields import TitleField, CommentField, TextField, \ | |
ee4d6a09 | 26 | StringField, ItemField, ItemsField, EmailField, \ |
d3a019f6 | 27 | DateField, FileField, BoolField, TableField |
0f48356a | 28 | |
0f48356a | 29 | |
0cd61ae9 P |
def reduce_to_alnum(s, replacement_char='-'):
    """Reduce a string to ASCII letters and digits.

    Accented characters are replaced by their unaccented base letter (NFKD
    decomposition followed by an ASCII encoding that drops whatever cannot
    be represented). Each run of remaining non-alphanumeric characters
    located between two alphanumeric characters becomes one single
    `replacement_char`; leading and trailing runs are dropped. Characters
    dropped by the ASCII encoding do not count as separators.

    s -- text to reduce: a unicode string or a UTF-8 encoded byte string
    replacement_char -- separator inserted between alphanumeric runs

    Returns a native `str` made of ASCII letters, digits and separators.
    """
    try:
        text_type = unicode
    except NameError:
        # interpreters without a separate `unicode` type: str is the text type
        text_type = str

    # isinstance (rather than an exact type test) so that subclasses of the
    # text type are not decoded a second time
    if not isinstance(s, text_type):
        s = text_type(s, 'utf-8')

    ascii_form = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore')
    if not isinstance(ascii_form, str):
        # the encoded form is a distinct bytes type there; get text back so
        # that iterating yields one-character strings
        ascii_form = ascii_form.decode('ascii')

    pieces = []
    separator_pending = False
    for c in ascii_form:
        if ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            # emit the separator lazily: never first, never last, and only
            # once per run whatever the length of `replacement_char`
            if separator_pending and pieces:
                pieces.append(replacement_char)
            separator_pending = False
            pieces.append(c)
        else:
            separator_pending = True
    return ''.join(pieces)
0f48356a | 45 | |
0f48356a | 46 | |
0cd61ae9 P |
def extract_fields(formdef, output_directory):
    """Give every data field of the form a unique export name.

    Title and comment fields carry no data and are skipped. A field is named
    after its `varname` when it has one, otherwise after its label reduced to
    lowercase alphanumerics separated by '_'. A name that is already taken
    gets a numeric suffix ('_2', '_3', ...) until it is unique.

    The mapping {field id: name} is stored in the module-level `field_names`
    and written to two files in `output_directory`:
    'field-names.txt' (one 'id:name:label' line per field) and
    'field-names.json'.

    formdef -- form definition whose `fields` are named
    output_directory -- existing directory receiving the two files
    """
    # TODO: should return a result, which would then be saved by the caller

    # XXX: temporary hack… :-/
    global field_names

    field_names = {}
    used_names = set()
    duplicate_counters = {}
    lines = []
    for field in formdef.fields:
        if isinstance(field, (TitleField, CommentField)):
            continue
        if field.varname:
            name = field.varname
        else:
            name = reduce_to_alnum(field.label, '_').lower()
        if name in used_names:  # duplicate
            base = name
            counter = duplicate_counters.get(base, 1)
            # keep counting until the suffixed name is itself unused: a form
            # may already contain a field literally named e.g. 'nom_2'
            while name in used_names:
                counter += 1
                name = '%s_%d' % (base, counter)
            duplicate_counters[base] = counter
        used_names.add(name)
        field_names[field.id] = name
        lines.append("%s:%s:%s\n" % (field.id, name, field.label))

    f = open(os.path.join(output_directory, 'field-names.txt'), 'w')
    try:
        f.writelines(lines)
    finally:
        f.close()

    # Encoded explicitly, as done for the per-dossier JSON files: the file is
    # opened in binary mode and the dump may contain non-ASCII characters.
    content = json.dumps(field_names, ensure_ascii=False).encode('utf-8')
    f = open(os.path.join(output_directory, 'field-names.json'), 'wb')
    try:
        f.write(content)
    finally:
        f.close()
76 | ||
77 | ||
78 | def extract_data(formdef, output_directory): | |
79 | """extraction des données du formulaire""" | |
80 | # TODO: devrait retourner un résultat, qui serait alors sauvé en dehors | |
81 | ||
e80345a7 P |
82 | # XXX: hack temporaire… :-/ |
83 | global pub | |
84 | ||
edf3eba0 P |
85 | # on charge la base des types MIME une fois pour toutes |
86 | #magicmime = magic.Magic(mime=True) => ce sera pour plus tard… | |
6244020f | 87 | magicmime = magic.open(magic.MAGIC_MIME) |
dffcfa61 | 88 | magicmime.load() |
edf3eba0 | 89 | |
2e14236b | 90 | liste_dossiers = [] |
0cd61ae9 | 91 | for object in formdef.data_class().select(): |
9d424de0 P |
92 | if object.user is None: |
93 | logging.warning("Dossier '%s' sans utilisateur associé ?!?"\ | |
94 | " On ignore...", object.id) | |
95 | continue | |
96 | ||
e4752013 P |
97 | try: |
98 | workflow_status = object.status.startswith('wf-') and \ | |
99 | object.get_workflow_status().name or None | |
100 | except: | |
101 | workflow_status = None | |
102 | ||
0cd61ae9 P |
103 | result = { |
104 | 'num_dossier': object.id, | |
105 | 'wcs_status': object.status, | |
e4752013 | 106 | 'wcs_workflow_status': workflow_status, |
0cd61ae9 P |
107 | 'wcs_user_email': object.user.email, |
108 | 'wcs_user_display_name': object.user.display_name, | |
109 | #'wcs_last_modified': strftime('%Y-%m-%d %H:%M:%S', gmtime(object.last_modified())), | |
e80345a7 | 110 | 'wcs_comments': [], |
0cd61ae9 | 111 | } |
e80345a7 P |
112 | |
113 | if object.evolution is not None: | |
114 | for e in object.evolution: | |
115 | if e.comment is not None: | |
da0c36c5 P |
116 | try: |
117 | who = pub.user_class.get(e.who).display_name | |
118 | except: | |
119 | who = 'Inconnu(e)' | |
e80345a7 P |
120 | time = strftime('%Y-%m-%d %H:%M:%S', e.time) |
121 | comment = '%s -- %s %s' % (e.comment, who, time) | |
122 | result['wcs_comments'].append(comment) | |
123 | ||
0cd61ae9 P |
124 | qfiles = { } |
125 | for field in formdef.fields: | |
126 | field_id = str(field.id) | |
127 | if not field_id in object.data: | |
128 | continue | |
129 | if isinstance(field, TitleField) or isinstance(field, CommentField): | |
130 | continue | |
131 | field_name = field_names[field_id] | |
132 | data = object.data.get(field_id) | |
ee4d6a09 P |
133 | if data is None: |
134 | result[field_name] = None | |
135 | continue | |
0cd61ae9 P |
136 | if isinstance(field, StringField) or isinstance(field, TextField) \ |
137 | or isinstance(field, EmailField) or isinstance(field, ItemField): | |
138 | result[field_name] = data | |
d3a019f6 | 139 | elif isinstance(field, ItemsField) or isinstance(field, TableField): |
ee4d6a09 | 140 | result[field_name] = data # liste => peux-être joindre sur ';' |
0cd61ae9 P |
141 | elif isinstance(field, BoolField): |
142 | result[field_name] = (data == 'True') | |
143 | elif isinstance(field, DateField): | |
abdb4a8c | 144 | if isinstance(data, struct_time): |
e2f76bef | 145 | result[field_name] = '%04d-%02d-%02d' % (data.tm_year, |
b182779a | 146 | data.tm_mon, data.tm_mday) |
e2f76bef P |
147 | else: |
148 | result[field_name] = data | |
791f9873 | 149 | elif isinstance(field, FileField): |
edf3eba0 P |
150 | if '.' in data.orig_filename: |
151 | extension = data.orig_filename.rpartition('.')[2].lower() | |
152 | else: # il n'y a pas d'extension dans le nom de fichier | |
153 | p = os.path.join(pub.app_dir, 'uploads', data.qfilename) | |
474e5135 P |
154 | try: |
155 | #m = magicmime.from_file(p) => ce sera pour plus tard… | |
156 | m = magicmime.file(p).split()[0].strip(';') | |
157 | extension = mimetypes.guess_extension(m) | |
158 | except: | |
159 | logging.warning("Type de fichier inconnu pour '%s'.", p) | |
160 | extension = None | |
edf3eba0 P |
161 | if extension is not None: |
162 | extension = extension[1:] | |
163 | else: | |
164 | extension = 'unknown' | |
0cd61ae9 P |
165 | result[field_name] = "%s.%s" % (field_name, extension) |
166 | qfiles[field_name] = data.qfilename | |
167 | else: | |
d3a019f6 P |
168 | logging.warning("Type de champ inconnu '%s' pour '%s' (%s).", |
169 | field.__class__.__name__, field_name, field.label) | |
0cd61ae9 P |
170 | |
171 | num_dossier = result['num_dossier'] | |
fa3aa24e P |
172 | nom = reduce_to_alnum(result.get('nom','sans-nom')).upper() |
173 | prenom = reduce_to_alnum(result.get('prenom','sans-prenom')).upper() | |
174 | adel = result.get('adresse_electronique','sans-adel').replace('@','-').lower() | |
0cd61ae9 P |
175 | |
176 | filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel) | |
2e14236b | 177 | liste_dossiers.append(filename + '.json') |
0cd61ae9 | 178 | |
e4752013 P |
179 | # création du sous-dossier destination, au besoin |
180 | dstdir = os.path.join(output_directory, 'data', result['wcs_status']) | |
181 | if not os.path.isdir(dstdir): | |
182 | os.mkdir(dstdir) | |
183 | ||
0cd61ae9 P |
184 | # copie des fichiers joints |
185 | for f in qfiles: | |
186 | result[f] = filename + '_' + result[f] | |
187 | src = os.path.join(pub.app_dir, 'uploads', qfiles[f]) | |
e4752013 | 188 | dst = os.path.join(dstdir, result[f]) |
0cd61ae9 | 189 | if not os.path.exists(dst) or os.path.getmtime(src) > os.path.getmtime(dst): |
a9de62a1 | 190 | shutil.copy2(src, dst) |
0cd61ae9 P |
191 | os.chmod(dst, 0644) |
192 | ||
193 | # génération du fichier JSON | |
e4752013 | 194 | jsonname = os.path.join(dstdir, filename + '.json') |
a9de62a1 | 195 | f = open(jsonname, 'wb') |
fa3aa24e | 196 | f.write(json.dumps(result, ensure_ascii=False).encode('utf-8')) |
0cd61ae9 P |
197 | f.close() |
198 | ||
a9de62a1 P |
199 | logging.info("Dossier '%s' : %s.", |
200 | filename, result['wcs_workflow_status']) | |
0cd61ae9 | 201 | |
b35749bb | 202 | liste_dossiers.sort() |
2e14236b | 203 | f = open(os.path.join(output_directory, 'liste-dossiers.json'), 'wb') |
b35749bb | 204 | f.write(json.dumps(liste_dossiers, ensure_ascii=False)) |
2e14236b P |
205 | f.close() |
206 | ||
0cd61ae9 P |
207 | |
208 | if __name__ == '__main__': | |
209 | import sys | |
210 | ||
211 | if len(sys.argv) != 4: | |
212 | print >>sys.stderr, "Usage : %s <dossier-destination> <site> <formulaire>" % sys.argv[0] | |
213 | sys.exit(1) | |
214 | ||
215 | VHOST = sys.argv[2] | |
216 | FORM_NAME = sys.argv[3] | |
217 | OUTPUT_DIRECTORY = os.path.join(sys.argv[1], VHOST, FORM_NAME) | |
218 | ||
219 | os.umask(0022) | |
220 | # création du dossier d'extraction, au besoin | |
9d424de0 P |
221 | if not os.path.isdir(os.path.join(OUTPUT_DIRECTORY, 'data')): |
222 | os.makedirs(os.path.join(OUTPUT_DIRECTORY, 'data'), 0755) | |
0cd61ae9 | 223 | |
a9de62a1 P |
224 | logging.basicConfig(level=logging.DEBUG, |
225 | format='%(asctime)s %(levelname)s %(message)s', | |
226 | filename=os.path.join(OUTPUT_DIRECTORY, 'last-run.log'), | |
227 | filemode='w') | |
228 | ||
229 | logging.info('Début.') | |
230 | ||
0cd61ae9 P |
231 | pub = publisher.WcsPublisher.create_publisher() |
232 | pub.app_dir = os.path.join(pub.app_dir, VHOST) | |
da0c36c5 | 233 | pub.set_config() |
0cd61ae9 P |
234 | |
235 | formdef = FormDef.get_by_urlname(FORM_NAME) | |
236 | ||
237 | extract_fields(formdef, OUTPUT_DIRECTORY) | |
238 | ||
b182779a P |
239 | try: |
240 | extract_data(formdef, OUTPUT_DIRECTORY) | |
241 | except: | |
242 | logging.exception("Interruption du traitement pour cause d'erreur !") | |
0cd61ae9 | 243 | |
a9de62a1 | 244 | logging.info('Fin.') |
0f48356a | 245 |