w.c.s. : on trie correctement cette fois…
[progfou.git] / wcs / wcs-extract
CommitLineData
0f48356a
P
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
edf3eba0
P
3"""
4Outil d'export de données w.c.s.
0f48356a 5
edf3eba0
P
6Copyright : Agence universitaire de la Francophonie — www.auf.org
7Licence : GNU General Public Licence, version 2
8Auteur : Jean Christophe André
9Date de création : 15 octobre 2009
10
11Depends: wcs, python-simplejson, python-magic
12"""
0f48356a
P
13import os
14import os.path
a9de62a1
P
15import shutil
16import logging
0f48356a
P
17from time import gmtime, strftime
18import simplejson as json
edf3eba0
P
19import magic
20import mimetypes
0f48356a 21
0cd61ae9
P
22from wcs import publisher
23from wcs.formdef import FormDef
24from wcs.fields import TitleField, CommentField, TextField, \
25 StringField, ItemField, EmailField, \
26 DateField, FileField, BoolField
0f48356a 27
0f48356a 28
0cd61ae9
P
def reduce_to_alnum(s, replacement_char='-'):
    """Reduce a string to ASCII letters and digits.

    Characters listed in the transliteration table are replaced by their
    unaccented counterpart, ASCII letters and digits are kept unchanged, and
    every run of other characters collapses into one ``replacement_char``.
    Separators at the start and at the end of the result are removed.

    ``s`` may be a text string or a UTF-8 encoded byte string.  The result is
    a native ``str``: UTF-8 encoded bytes on Python 2, text on Python 3.
    """
    avec_accent = u'çÇáàâÁÀÂéèêëÉÈÊËíìîïÍÌÎÏóòôöÓÒÔÖúùûüÚÙÛÜýỳyÿÝỲYŸ'
    sans_accent = u'cCaaaAAAeeeeEEEEiiiiIIIIooooOOOOuuuuUUUUyyyyYYYY'

    # ``unicode`` only exists on Python 2; fall back to ``str`` elsewhere so
    # the function does not depend on the interpreter generation.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # isinstance() rather than an exact type comparison: subclasses of the
    # text type are already decoded and must not be decoded a second time.
    if not isinstance(s, text_type):
        s = text_type(s, 'utf-8')

    pieces = []
    for c in s:
        index = avec_accent.find(c)
        if index >= 0:
            pieces.append(sans_accent[index])
        elif ('a' <= c.lower() <= 'z') or ('0' <= c <= '9'):
            pieces.append(c)
        elif pieces and pieces[-1] != replacement_char:
            pieces.append(replacement_char)
        # otherwise nothing has been emitted yet, or a separator was just
        # emitted: drop the character

    r = u''.join(pieces).strip(replacement_char)

    # On Python 2 callers get a byte string back, as they always did.
    if text_type is not str:
        r = r.encode('utf-8')
    return r
52
0f48356a 53
0cd61ae9
P
def extract_fields(formdef, output_directory):
    """Give every data field of the form a unique name and save the mapping.

    Title and comment fields are skipped.  Each remaining field gets a name
    derived from its label (lower-cased, reduced to alphanumerics with ``_``
    as separator); a numeric suffix is appended when the name is already
    taken.

    Side effects:
      * the module-level ``field_names`` dict is rebuilt, keyed by
        ``str(field.id)``, which is the key ``extract_data`` looks up;
      * ``field-names.txt`` is written in ``output_directory`` with one
        ``id:name:label`` line per field;
      * ``field-names.json`` is written in ``output_directory`` with the
        id -> name mapping.
    """
    # TODO: should return a result, which would then be saved by the caller

    # XXX: temporary hack... :-/
    global field_names

    field_names = {}
    field_names_duplicates = {}
    used_names = set()

    f = open(os.path.join(output_directory, 'field-names.txt'), 'w')
    try:
        for field in formdef.fields:
            if isinstance(field, (TitleField, CommentField)):
                continue
            name = reduce_to_alnum(field.label, '_').lower()
            if name in used_names:  # duplicate
                # Keep counting until the suffixed name is really free: a
                # field literally labelled "<name> 2" may already own it.
                count = field_names_duplicates.get(name, 1)
                candidate = name
                while candidate in used_names:
                    count += 1
                    candidate = '%s_%d' % (name, count)
                field_names_duplicates[name] = count
                name = candidate
            used_names.add(name)
            field_names[str(field.id)] = name
            f.write("%s:%s:%s\n" % (field.id, name, field.label))
    finally:
        f.close()

    # For the JSON export only, numeric ids become integers -- presumably so
    # that sort_keys orders them numerically rather than as text.  The
    # module-level mapping keeps its string keys.
    try:
        json_names = dict((int(k), v) for k, v in field_names.items())
    except (TypeError, ValueError):
        # at least one id is not numeric: export the mapping unchanged
        json_names = field_names

    f = open(os.path.join(output_directory, 'field-names.json'), 'wb')
    try:
        f.write(json.dumps(json_names, ensure_ascii=False, sort_keys=True))
    finally:
        f.close()
84
85
def _field_name_for(field):
    """Return the export name registered for ``field`` in ``field_names``.

    The mapping may be keyed by the id as a string, by the raw id or by the
    id as an integer, so each form is tried in turn.  Raises ``KeyError``
    when none of them is present.
    """
    candidates = [str(field.id), field.id]
    try:
        candidates.append(int(field.id))
    except (TypeError, ValueError):
        pass
    for key in candidates:
        if key in field_names:
            return field_names[key]
    raise KeyError(field.id)


def extract_data(formdef, output_directory):
    """Export every submitted record of the form as a JSON file.

    For each record returned by ``formdef.data_class().select()``:
      * records without an associated user are skipped with a warning;
      * a dict is built with the record id, its status, workflow status
        name, the user's e-mail and display name, the evolution comments and
        one entry per form field (named after ``field_names``);
      * uploaded files are copied to ``<output_directory>/data``;
      * the dict is written to ``<output_directory>/data/<name>.json``.

    The sorted list of JSON file names is finally written to
    ``<output_directory>/liste-dossiers.json``.

    Relies on the module-level ``pub`` and ``field_names``.  The form must
    provide fields named ``nom``, ``prenom`` and ``adresse_electronique``
    (``KeyError`` otherwise).  Raises ``RuntimeError`` when a field has a
    type that is not handled here.
    """
    # TODO: should return a result, which would then be saved by the caller

    # XXX: temporary hack... :-/
    global pub

    # the MIME type database is loaded once and for all
    #magicmime = magic.Magic(mime=True) => that will be for later...
    magicmime = magic.open(magic.MAGIC_MIME)
    magicmime.load()

    # rw-r--r--; spelled with int() because the octal literal syntax is not
    # the same on every Python version
    attachment_mode = int('644', 8)

    liste_dossiers = []
    for formdata in formdef.data_class().select():
        if formdata.user is None:
            logging.warning("Dossier '%s' sans utilisateur associé ?!?"
                            " On ignore...", formdata.id)
            continue

        result = {
            'num_dossier': formdata.id,
            'wcs_status': formdata.status,
            'wcs_workflow_status': formdata.get_workflow_status().name,
            'wcs_user_email': formdata.user.email,
            'wcs_user_display_name': formdata.user.display_name,
            #'wcs_last_modified': strftime('%Y-%m-%d %H:%M:%S', gmtime(formdata.last_modified())),
            'wcs_comments': [],
        }

        if formdata.evolution is not None:
            for e in formdata.evolution:
                if e.comment is not None:
                    who = pub.user_class.get(e.who).display_name
                    when = strftime('%Y-%m-%d %H:%M:%S', e.time)
                    comment = '%s -- %s %s' % (e.comment, who, when)
                    result['wcs_comments'].append(comment)

        # export name of each file field -> name of the uploaded file
        qfiles = {}
        for field in formdef.fields:
            field_id = str(field.id)
            if field_id not in formdata.data:
                continue
            if isinstance(field, (TitleField, CommentField)):
                continue
            field_name = _field_name_for(field)
            data = formdata.data.get(field_id)
            # NOTE(review): a stored value of None would make the date and
            # file branches below fail -- confirm whether w.c.s. can store it.
            if isinstance(field, (StringField, TextField,
                                  EmailField, ItemField)):
                result[field_name] = data
            elif isinstance(field, BoolField):
                result[field_name] = (data == 'True')
            elif isinstance(field, DateField):
                result[field_name] = strftime('%Y-%m-%d', data)
            elif isinstance(field, FileField):
                if '.' in data.orig_filename:
                    extension = data.orig_filename.rpartition('.')[2].lower()
                else:  # the file name has no extension: guess one
                    p = os.path.join(pub.app_dir, 'uploads', data.qfilename)
                    try:
                        #m = magicmime.from_file(p) => that will be for later...
                        m = magicmime.file(p).split()[0].strip(';')
                        extension = mimetypes.guess_extension(m)
                    except Exception:
                        # best effort: any detection failure just means an
                        # unknown type, but Ctrl-C must still get through
                        logging.warning("Type de fichier inconnu pour '%s'.", p)
                        extension = None
                    if extension is not None:
                        # guess_extension() includes the leading dot
                        extension = extension[1:]
                    else:
                        extension = 'unknown'
                result[field_name] = "%s.%s" % (field_name, extension)
                qfiles[field_name] = data.qfilename
            else:
                logging.error("Type de champ inconnu '%s' pour '%s'.",
                              field.__class__.__name__, field.label)
                raise RuntimeError

        num_dossier = result['num_dossier']
        nom = reduce_to_alnum(result['nom']).upper()
        prenom = reduce_to_alnum(result['prenom']).upper()
        adel = result['adresse_electronique'].replace('@', '-').lower()

        filename = "%04d-%s-%s-%s" % (num_dossier, nom, prenom, adel)
        liste_dossiers.append(filename + '.json')

        # copy the attached files
        for attachment in qfiles:
            result[attachment] = filename + '_' + result[attachment]
            src = os.path.join(pub.app_dir, 'uploads', qfiles[attachment])
            dst = os.path.join(output_directory, 'data', result[attachment])
            # only copy when the destination is missing or older
            if not os.path.exists(dst) \
                    or os.path.getmtime(src) > os.path.getmtime(dst):
                shutil.copy2(src, dst)
                # copy2 carries the source permission bits over: reset them
                os.chmod(dst, attachment_mode)

        # generate the JSON file
        jsonname = os.path.join(output_directory, 'data', filename + '.json')
        out = open(jsonname, 'wb')
        try:
            out.write(json.dumps(result, ensure_ascii=False, sort_keys=True))
        finally:
            out.close()

        logging.info("Dossier '%s' : %s.",
                     filename, result['wcs_workflow_status'])

    liste_dossiers.sort()
    out = open(os.path.join(output_directory, 'liste-dossiers.json'), 'wb')
    try:
        out.write(json.dumps(liste_dossiers, ensure_ascii=False))
    finally:
        out.close()
192
0cd61ae9
P
193
if __name__ == '__main__':
    import sys

    # Exactly three arguments are expected: destination directory, site
    # (virtual host) and form name.
    if len(sys.argv) != 4:
        usage = "Usage : %s <dossier-destination> <site> <formulaire>" % sys.argv[0]
        sys.stderr.write(usage + "\n")
        sys.exit(1)

    VHOST = sys.argv[2]
    FORM_NAME = sys.argv[3]
    OUTPUT_DIRECTORY = os.path.join(sys.argv[1], VHOST, FORM_NAME)

    # Octal modes are spelled with int() because the octal literal syntax is
    # not the same on every Python version.
    os.umask(int('022', 8))

    # create the extraction directory if needed
    data_directory = os.path.join(OUTPUT_DIRECTORY, 'data')
    if not os.path.isdir(data_directory):
        os.makedirs(data_directory, int('755', 8))

    # the log file is rewritten from scratch on every run
    logging.basicConfig(
        filename=os.path.join(OUTPUT_DIRECTORY, 'last-run.log'),
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s')

    logging.info('Début.')

    # point the publisher at the application directory of the requested site
    pub = publisher.WcsPublisher.create_publisher()
    pub.app_dir = os.path.join(pub.app_dir, VHOST)

    formdef = FormDef.get_by_urlname(FORM_NAME)

    # field names first: extract_data() relies on the mapping they build
    extract_fields(formdef, OUTPUT_DIRECTORY)
    extract_data(formdef, OUTPUT_DIRECTORY)

    logging.info('Fin.')
0f48356a 227