Fixed keyword search
[auf_savoirs_en_partage_django.git] auf_savoirs_en_partage/savoirs/lib/recherche.py
# -*- encoding: utf-8 -*-
import urllib, httplib, time, simplejson, pprint, math, re

from django.core.urlresolvers import reverse
from django.conf import settings
from auf_savoirs_en_partage.backend_config import RESOURCES

from sep import SEP
from utils import smart_str
from savoirs.globals import configuration

def google_search (page, q):
    """Run the query `q` against the Google AJAX Search API and return a
    dict with the keys 'results', 'last_page' and 'more_link'."""
    data = {'results': [], 'last_page': 0, 'more_link': ''}
    params = {'q': q,
              'rsz': 'large',
              'v': '1.0',
              'start': page * configuration['resultats_par_page'],
              }

    # Limit the search to the configured custom search engine; the current
    # time is interpolated into the 'google_xml' URL.
    params['cref'] = configuration['google_xml'] % time.time()
    url = "/ajax/services/search/web?" + urllib.urlencode (params)

    handle = httplib.HTTPConnection ('ajax.googleapis.com')
    handle.request ("GET", url)
    r = handle.getresponse ()
    response = simplejson.loads(r.read ())
    #print pprint.pformat (params)
    #print pprint.pformat (response)
    handle.close ()

    if len (response['responseData']['results']) > 0:
        # The cursor holds the paging information and the "more results" link.
        for i in response['responseData']['cursor']['pages']:
            p = int (i['label']) - 1
            if p > data['last_page']:
                data['last_page'] = p

        for r in response['responseData']['results']:
            data['results'].append( {'uri': r['url'],
                                     'content': r['content'],
                                     'title': r['title']} )

        data['more_link'] = response['responseData']['cursor']['moreResultsUrl']
    return data
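
# Illustrative sketch (not part of the original file): the shape of the dict
# returned by google_search. The values below are invented; only the keys and
# their types come from the code above.
#
# >>> google_search(0, u'droits de la personne')
# {'results': [{'uri': '...', 'content': '... <b>droits</b> ...', 'title': '...'}],
#  'last_page': 7,
#  'more_link': 'http://www.google.com/search?...'}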

def sep_build_content (regexp, description):
    """Build an excerpt of roughly `maxlen` characters from `description`,
    centred on the first match of `regexp`, and wrap every match in <b>
    tags."""
    maxlen = 200
    content = description
    if len (description) > maxlen:
        start = 0
        loc = regexp.search (description)
        if loc:
            start = loc.start ()

        # Take a window of maxlen characters around the first match...
        f = start - (maxlen / 2)
        t = 0
        if f < 0:
            t = -f
            f = 0
        t += start + (maxlen / 2)
        # ...then snap both ends to the nearest sentence boundary ('.').
        if f > 0:
            while description[f] != '.' and f > 0:
                f -= 1
            if f > 0:
                f += 1
        if t < len (description):
            while t < len (description) and description[t] != '.':
                t += 1
            t += 1
        content = description[f:t]
        if f > 0:
            content = "(...) " + content
        if t < (len (description) - 1):
            content = content + " (...)"
    content = regexp.sub (r'\1<b>\2</b>\3', content)
    return content
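
# Illustrative sketch (not in the original file): with a pattern of the shape
# produced by make_regexp() below, a long description (here a placeholder
# variable standing for a text of more than 200 characters) is reduced to a
# short excerpt around the first hit, roughly:
#
# >>> r = re.compile(ur'([\W|-]{1})(banane)([\W|-]{1})', re.I | re.U)
# >>> sep_build_content(r, long_description)
# u'(...) la culture de la <b>banane</b> plantain en zone tropicale. (...)'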

def make_regexp (q):
    """Build a single highlighting regexp from the words (longer than two
    characters) found in the values of the query dict `q`, ignoring the
    'operator' key."""
    words = []
    w = re.compile (r'\W+', re.U)
    for k in q.keys ():
        if k != 'operator':
            words.extend(w.split (smart_str(q[k]).decode("utf-8")))
    # Drop very short words and try the longest ones first.
    words = filter (lambda x: len(x)>2, words)
    words.sort (lambda x,y: len(y)-len(x))

    patt = "|".join (words)
    patt = "([\W|-]{1})(" + patt + ")([\W|-]{1})"
    return re.compile (patt, re.I|re.U)

def hl (r, string):
    """Wrap every match of `r` in `string` in <b> tags."""
    if string is not None:
        return r.sub (r'\1<b>\2</b>\3', string)
    return None
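
# Illustrative sketch (not in the original file): how make_regexp() and hl()
# combine. The query dict mimics the simple-search dict built by cherche()
# below.
#
# >>> r = make_regexp({'q': u'formation ouverte'})
# >>> hl(r, u'Une formation à distance, ouverte à tous')
# u'Une <b>formation</b> à distance, <b>ouverte</b> à tous'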

def sep_search (page, q, data):
    """Run the query dict `q` against the SEP backend, append one entry per
    record of the requested page to data['results'] and set
    data['last_page']."""
    f = page * configuration['resultats_par_page']
    t = f + 8   # page size is hard-coded to 8 results here
    s = SEP ()
    matches = s.search (q)
    data['last_page'] = math.ceil (float(len (matches)) /
                                   float(configuration['resultats_par_page'])) - 1
    set = s.get (matches[f:t])
    regexp = make_regexp (q)

    for r in set:
        uri = r.get ("uri", "")
        if len (uri) == 0:
            uri = r.get ("source")

        serveur = RESOURCES[r.get('server')]['url']

        # Keep the source only if it is not a URL
        source = r.get("source", None)
        if source is not None and source.startswith('http'):
            source = None

        title = r.get ("title", "")
        content = sep_build_content (regexp, r.get ("description", ""))

        contributeurs = r.get('contributor')
        if contributeurs is not None:
            contributeurs = "; ".join (contributeurs)

        subject = r.get ('subject')
        if subject is not None:
            subject = ", ".join (subject)

        data['results'].append ({
            'uri': uri,
            'getServeurURL': serveur,
            'source': source,
            'id': r.get("id"),
            'title': hl(regexp, title),
            'content': content,
            'creator': '; '.join([hl(regexp, x) for x in r.get('creator', [])]),
            'contributors': hl(regexp, contributeurs),
            'subject': hl(regexp, subject),
            'modified': r.get('modified'),
            'isbn': r.get('isbn'),
            'admin_url': reverse('admin:savoirs_record_change', args=[r.get('id')])
            })
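
# Illustrative note (not in the original file): `q` is always a dict here. A
# simple search arrives as {'q': u'some keywords'} (see cherche() below); an
# advanced search passes a dict of fields plus an optional 'operator' key,
# which make_regexp() skips when building the highlighter.
#
# >>> rc = {'results': [], 'last_page': 0, 'more_link': ''}
# >>> sep_search(0, {'q': u'formation ouverte'}, rc)   # fills rc in place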

def cherche (page, q, engin=None):
    """Dispatch the search to the requested engine ('google', 'sep' or
    'avancee'), defaulting to the one set in the configuration, and return
    the result dict."""
    rc = {'results': [], 'last_page': 0, 'more_link': ''}

    if engin is None:
        engin = configuration['engin_recherche']

    if engin == 'google':
        rc = google_search (page, q)

    elif engin == 'sep':
        sep_search (page, {'q': q}, rc)

    elif engin == 'avancee':
        sep_search (page, q, rc)

    return rc
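
# Illustrative usage (not in the original file): pages are numbered from 0 and
# the engine defaults to configuration['engin_recherche'].
#
# >>> rc = cherche(0, u'développement durable')
# >>> rc['last_page'], len(rc['results'])   # depends on the backend data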

def build_search_regexp(query):
    """Build a regular expression that can be used to search for the
    keywords given in 'query'."""
    words = query.split()
    if not words:
        return None
    parts = []
    for word in words:
        part = re.escape(word.lower())
        # Regular expressions don't know about the uppercase versions of
        # accented characters. :(
        # Careful: re.escape will have put a backslash in front of every
        # accented character...
        part = part.replace(u'\\à', u'[àÀ]')
        part = part.replace(u'\\â', u'[âÂ]')
        part = part.replace(u'\\é', u'[éÉ]')
        part = part.replace(u'\\ê', u'[êÊ]')
        part = part.replace(u'\\î', u'[îÎ]')
        part = part.replace(u'\\ô', u'[ôÔ]')
        part = part.replace(u'\\ç', u'[çÇ]')

        # Do this after handling the accented characters...
        part = part.replace('a', u'[aàâÀÂ]')
        part = part.replace('e', u'[eéèëêÉÊ]')
        part = part.replace('i', u'[iïîÎ]')
        part = part.replace('o', u'[oôÔ]')
        part = part.replace('u', u'[uûüù]')
        part = part.replace('c', u'[cç]')

        parts.append(part)
    return re.compile('|'.join(parts), re.I)
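
# Illustrative example (not in the original file): each query word becomes an
# alternative that matches the accented characters handled above, so accented
# and unaccented spellings find each other.
#
# >>> r = build_search_regexp(u'ecole rurale')
# >>> bool(r.search(u'École rurale au Sénégal'))
# True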