# auf_savoirs_en_partage/savoirs/lib/recherche.py
# -*- encoding: utf-8 -*-
import urllib, httplib, time, simplejson, pprint, math, re
from django.core.urlresolvers import reverse
from django.conf import settings
from auf_savoirs_en_partage.backend_config import RESOURCES
from sep import SEP
from utils import smart_str
from savoirs.globals import configuration

def google_search(page, q, data):
    """Run a Google AJAX Search query and append the hits to data['results']."""
    params = {'q': q,
              'rsz': 'large',
              'v': '1.0',
              'start': page * configuration['resultats_par_page'],
              }

    params['cref'] = configuration['google_xml'] % time.time()
    url = "/ajax/services/search/web?" + urllib.urlencode(params)

    handle = httplib.HTTPConnection('ajax.googleapis.com')
    handle.request("GET", url)
    r = handle.getresponse()
    response = simplejson.loads(r.read())
    #print pprint.pformat (params)
    #print pprint.pformat (response)
    handle.close()

    # The API returns responseData = None on errors; guard before indexing.
    response_data = response.get('responseData')
    if response_data and len(response_data['results']) > 0:
        # Page labels are 1-based; keep the highest 0-based page seen.
        for i in response_data['cursor']['pages']:
            p = int(i['label']) - 1
            if p > data['last_page']:
                data['last_page'] = p

        for hit in response_data['results']:
            data['results'].append({'uri': hit['url'],
                                    'content': hit['content'],
                                    'title': hit['title']})

        data['more_link'] = response_data['cursor']['moreResultsUrl']

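# For reference, a call like google_search(0, u'climat', data) issues a GET
# against ajax.googleapis.com such as (illustrative values; 'cref' comes from
# configuration['google_xml'] expanded with the current timestamp):
#
#   /ajax/services/search/web?q=climat&rsz=large&v=1.0&start=0&cref=...
#
# and fills 'data' with uri/content/title dicts, 'last_page' and 'more_link'.
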
def sep_build_content(regexp, description):
    """Build an excerpt of about 200 characters of 'description', centred
    on the first match of 'regexp', and wrap the matched words in <b>."""
    maxlen = 200
    content = description
    if len(description) > maxlen:
        # Centre the excerpt window on the first keyword match, if any.
        start = 0
        loc = regexp.search(description)
        if loc:
            start = loc.start()

        # Window of maxlen characters around 'start'; if it would begin
        # before the text, shift the surplus to the end.
        f = start - (maxlen / 2)
        t = 0
        if f < 0:
            t = -f
            f = 0
        t += start + (maxlen / 2)

        # Snap both ends of the window to the nearest sentence boundary.
        if f > 0:
            while description[f] != '.' and f > 0:
                f -= 1
            if f > 0:
                f += 1
        if t < len(description):
            while t < len(description) and description[t] != '.':
                t += 1
            t += 1

        content = description[f:t]
        if f > 0:
            content = "(...) " + content
        if t < (len(description) - 1):
            content = content + " (...)"

    content = regexp.sub(r'\1<b>\2</b>\3', content)
    return content

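# A minimal sketch of the expected output (hypothetical strings):
#
#   >>> r = make_regexp({'q': u'climat'})
#   >>> sep_build_content(r, u'Une etude sur le climat en Afrique.')
#   u'Une etude sur le <b>climat</b> en Afrique.'
#
# Longer descriptions are cut at sentence boundaries and wrapped in "(...)".
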
def make_regexp(q):
    """Compile a regexp matching any search word of the query 'q', with
    three capturing groups (prefix, word, suffix) as expected by the
    highlighting substitution."""
    words = []
    w = re.compile(r'\W+', re.U)
    for k in q.keys():
        if k != 'operator':
            words.extend(w.split(smart_str(q[k]).decode("utf-8")))
    # Keep significant words only, longest first so that longer
    # alternatives win in the pattern.
    words = filter(lambda x: len(x) > 2, words)
    words.sort(key=len, reverse=True)

    patt = "|".join(words)
    patt = r"(\W)(" + patt + r")(\W)"
    return re.compile(patt, re.I | re.U)

def hl(r, string):
    """Wrap the words of 'string' matched by regexp 'r' in <b> tags."""
    if string is not None:
        return r.sub(r'\1<b>\2</b>\3', string)
    return None

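# Usage sketch (hypothetical query and text):
#
#   >>> r = make_regexp({'q': u'climat', 'operator': 'and'})
#   >>> hl(r, u'Le climat change')
#   u'Le <b>climat</b> change'
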
def sep_search(page, q, data):
    """Run a SEP backend search and append one page of records, with
    highlighted fields, to data['results']."""
    f = page * configuration['resultats_par_page']
    t = f + configuration['resultats_par_page']
    s = SEP()

    matches = s.search(q)
    data['last_page'] = math.ceil(float(len(matches)) /
                                  float(configuration['resultats_par_page'])) - 1
    records = s.get(matches[f:t])
    regexp = make_regexp(q)

    for r in records:
        uri = r.get("uri", "")
        if len(uri) == 0:
            uri = r.get("source")

        serveur = RESOURCES[r.get('server')]['url']

        # Keep the source only when it is not a URL
        source = r.get("source", None)
        if source is not None and source.startswith('http'):
            source = None

        title = r.get("title", "")
        content = sep_build_content(regexp, r.get("description", ""))

        contributeurs = r.get('contributor')
        if contributeurs is not None:
            contributeurs = "; ".join(contributeurs)

        subject = r.get('subject')
        if subject is not None:
            subject = ", ".join(subject)

        data['results'].append({
            'uri': uri,
            'getServeurURL': serveur,
            'source': source,
            'id': r.get("id"),
            'title': hl(regexp, title),
            'content': content,
            'creator': '; '.join([hl(regexp, x) for x in r.get('creator', [])]),
            'contributors': hl(regexp, contributeurs),
            'subject': hl(regexp, subject),
            'modified': r.get('modified'),
            'isbn': r.get('isbn'),
            'admin_url': reverse('admin:savoirs_record_change', args=[r.get('id')])
        })

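# The query 'q' is a dict of field criteria; every key except 'operator'
# feeds words into the highlighting regexp. A sketch (simple search shape,
# as built by cherche() below):
#
#   data = {'results': [], 'last_page': 0, 'more_link': ''}
#   sep_search(0, {'q': u'eau potable'}, data)
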
def cherche(page, q, engin=None):
    """Dispatch the search to the configured engine: 'google', 'sep'
    (simple search) or 'avancee' (advanced search). Returns a dict with
    'results', 'last_page' and 'more_link'."""
    rc = {'results': [], 'last_page': 0, 'more_link': ''}

    if engin is None:
        engin = configuration['engin_recherche']

    if engin == 'google':
        google_search(page, q, rc)

    elif engin == 'sep':
        sep_search(page, {'q': q}, rc)

    elif engin == 'avancee':
        sep_search(page, q, rc)

    return rc

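# Usage sketch (assumes configuration['engin_recherche'] is set):
#
#   rc = cherche(0, u'eau potable')
#   for hit in rc['results']:
#       print hit['title']
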
def build_search_regexp(query):
    """Build a regular expression that can be used to search for the
    keywords given in 'query'."""
    words = query.split()
    if not words:
        return None
    parts = []
    for word in words:
        part = re.escape(word.lower())
        # Regular expressions don't know about the uppercase version of
        # accented characters. :(
        part = part.replace(u'à', u'[àÀ]')
        part = part.replace(u'â', u'[âÂ]')
        part = part.replace(u'é', u'[éÉ]')
        part = part.replace(u'ê', u'[êÊ]')
        part = part.replace(u'î', u'[îÎ]')
        part = part.replace(u'ç', u'[çÇ]')

        # Do this after handling the accented characters...
        part = part.replace('a', u'[aàâÀÂ]')
        part = part.replace('e', u'[eéèëêÉÊ]')
        part = part.replace('i', u'[iïîÎ]')
        part = part.replace('o', u'[oô]')
        part = part.replace('u', u'[uûüù]')
        part = part.replace('c', u'[cç]')

        parts.append(part)
    return re.compile('|'.join(parts), re.I)
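
# A doctest-style sketch (hypothetical keyword): matching is insensitive
# to accents and to the case of accented characters.
#
#   >>> r = build_search_regexp(u'ecole')
#   >>> bool(r.search(u'École primaire'))
#   True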