Commit | Line | Data |
---|---|---|
d15017b2 CR |
1 | # -*- encoding: utf-8 -*- |
2 | import urllib, httplib, time, simplejson, pprint, math, re | |
c9f40fdd | 3 | from django.core.urlresolvers import reverse |
40a5ebfb | 4 | from django.conf import settings |
f98ad449 | 5 | from auf_savoirs_en_partage.backend_config import RESOURCES |
0cc5f772 CR |
6 | from sep import SEP |
7 | from utils import smart_str | |
ecc62927 | 8 | from savoirs.globals import configuration |
d15017b2 | 9 | |
0b1ddc11 EMS |
def google_search (page, q):
    """Query the Google AJAX Search API and return one page of results.

    page -- 0-based page number
    q    -- query string

    Returns {'results': [{'uri', 'content', 'title'}, ...],
             'last_page': int, 'more_link': str}.
    Returns an empty result set when the API reports an error
    (the API sends responseData = null in that case).
    """
    data = {'results': [], 'last_page': 0, 'more_link': ''}
    params = {'q': q,
              'rsz': 'large',
              'v': '1.0',
              'start': page * configuration['resultats_par_page'],
              }
    # The custom search engine spec is an XML URL template; the %-format
    # inserts a timestamp (presumably a cache-buster — TODO confirm).
    params['cref'] = configuration['google_xml'] % time.time()
    url = "/ajax/services/search/web?" + urllib.urlencode (params)

    handle = httplib.HTTPConnection ('ajax.googleapis.com')
    try:
        handle.request ("GET", url)
        r = handle.getresponse ()
        response = simplejson.loads(r.read ())
    finally:
        # Always release the connection, even if the request fails.
        handle.close ()

    # On API errors responseData is None; fall back to empty structures
    # instead of raising TypeError/KeyError.
    response_data = response.get('responseData') or {}
    results = response_data.get('results') or []
    if len (results) > 0:
        # Derive the last page index from the pager cursor
        # (labels are 1-indexed).
        for i in response_data['cursor']['pages']:
            p = int (i['label']) - 1
            if p > data['last_page']:
                data['last_page'] = p

        for r in results:
            data['results'].append( {'uri': r['url'],
                                     'content': r['content'],
                                     'title': r['title']} )

        data['more_link'] = response_data['cursor']['moreResultsUrl']
    return data
d15017b2 CR |
42 | |
def sep_build_content (regexp, description):
    """Build a highlighted snippet (~200 chars max) from `description`.

    When the text exceeds the limit, a window centred on the first
    `regexp` match is extracted, its edges are snapped to sentence
    breaks ('.'), and "(...)" markers flag the truncated sides.
    Every `regexp` match is then wrapped in <b>...</b> tags.
    """
    maxlen = 200
    content = description
    if len (description) > maxlen:
        # Centre the excerpt on the first search-term hit, if any.
        match = regexp.search (description)
        anchor = match.start () if match else 0

        half = maxlen // 2           # integer half-window
        begin = anchor - half
        # Any underflow on the left is given back to the right side.
        end = anchor + half + (-begin if begin < 0 else 0)
        if begin < 0:
            begin = 0

        # Snap the left edge back to the previous sentence break.
        if begin > 0:
            while description[begin] != '.' and begin > 0:
                begin -= 1
            if begin > 0:
                begin += 1
        # Extend the right edge up to (and including) the next '.'.
        if end < len (description):
            while end < len (description) and description[end] != '.':
                end += 1
            end += 1

        content = description[begin:end]
        if begin > 0:
            content = "(...) " + content
        if end < (len (description) - 1):
            content = content + " (...)"
    return regexp.sub (r'\1<b>\2</b>\3', content)
74 | ||
75 | ||
f4db9f51 CR |
def make_regexp (q):
    """Compile a case-insensitive regexp matching any search word of `q`.

    `q` is a query dict; every value except the 'operator' entry is
    split on non-word characters, words of 3+ characters are kept
    (longest first, so longer alternatives win) and joined into one
    pattern with a capturing group before and after the word.
    """
    splitter = re.compile (r'\W+', re.U)
    words = []
    for key in q.keys ():
        if key == 'operator':
            continue
        words.extend (splitter.split (smart_str(q[key]).decode("utf-8")))
    words = [word for word in words if len (word) > 2]
    words.sort (key=len, reverse=True)   # longest alternatives first

    patt = "([\W|-]{1})(" + "|".join (words) + ")([\W|-]{1})"
    return re.compile (patt, re.I|re.U)
f4db9f51 CR |
88 | |
def hl (r, string):
    """Wrap every match of regexp `r` in `string` with <b> tags.

    Returns None unchanged so callers can pass optional fields through.
    """
    if string is None:
        return None
    return r.sub (r'\1<b>\2</b>\3', string)
93 | ||
94 | ||
d15017b2 CR |
def sep_search (page, q, data):
    """Run a SEP (Savoirs en partage) search and fill `data` in place.

    page -- 0-based page number
    q    -- query dict (see make_regexp)
    data -- dict with 'results' (list) and 'last_page' keys, mutated
            in place; nothing is returned.
    """
    per_page = configuration['resultats_par_page']
    f = page * per_page
    # BUG FIX: the slice size was hard-coded to 8 while the last_page
    # computation below uses `resultats_par_page`; keep them consistent.
    t = f + per_page
    s = SEP ()

    matches = s.search (q)
    data['last_page'] = math.ceil (float(len (matches)) / \
            float(per_page)) - 1
    records = s.get (matches[f:t])   # renamed: don't shadow builtin `set`
    regexp = make_regexp (q)

    for r in records:
        uri = r.get ("uri", "")
        if len (uri) == 0:
            uri = r.get ("source")

        serveur = RESOURCES[r.get('server')]['url']

        # Keep the source only when it is not already a URL.
        source = r.get("source", None)
        if source is not None and source.startswith('http'):
            source = None

        title = r.get ("title", "")
        content = sep_build_content (regexp, r.get ("description", ""))

        contributeurs = r.get('contributor')
        if contributeurs is not None:
            contributeurs = "; ".join (contributeurs)

        subject = r.get ('subject')
        if subject is not None:
            subject = ", ".join (subject)

        data['results'].append ({
            'uri': uri,
            'getServeurURL': serveur,
            'source' : source,
            'id': r.get("id"),
            'title': hl(regexp, title),
            'content': content,
            'creator': '; '.join([hl(regexp, x) for x in r.get('creator', [])]),
            'contributors': hl(regexp, contributeurs),
            'subject': hl(regexp, subject),
            'modified': r.get('modified'),
            'isbn': r.get('isbn'),
            'admin_url': reverse('admin:savoirs_record_change', args=[r.get('id')])
            })
d15017b2 | 143 | |
d15017b2 | 144 | |
def cherche (page, q, engin=None):
    """Dispatch a search to the chosen engine and return its results.

    page  -- 0-based page number
    q     -- query (string for 'google'/'sep', dict for 'avancee')
    engin -- engine name; falls back to the configured default

    Returns {'results': [...], 'last_page': int, 'more_link': str};
    an unknown engine yields the empty default structure.
    """
    if engin is None:
        engin = configuration['engin_recherche']

    rc = {'results': [], 'last_page': 0, 'more_link': ''}

    if engin == 'google':
        rc = google_search (page, q)
    elif engin == 'sep':
        # Simple search: wrap the plain query string into a query dict.
        sep_search (page, {'q': q}, rc)
    elif engin == 'avancee':
        sep_search (page, q, rc)

    return rc
0b72ebef EMS |
161 | |
def build_search_regexp(query):
    """Compile a regexp that finds any of the keywords of `query`.

    Keywords are matched case-insensitively and with accent folding
    (e.g. 'e' also matches 'é', 'ê', ...).  Returns None when the
    query contains no words.
    """
    words = query.split()
    if not words:
        return None

    # Regexps don't know the upper-case form of accented characters,
    # and re.escape puts a backslash in front of them, so the escaped
    # accented forms are mapped to explicit classes first...
    accented = [
        (u'\\à', u'[àÀ]'),
        (u'\\â', u'[âÂ]'),
        (u'\\é', u'[éÉ]'),
        (u'\\ê', u'[êÊ]'),
        (u'\\î', u'[îÎ]'),
        (u'\\ô', u'[ôÔ]'),
        (u'\\ç', u'[çÇ]'),
    ]
    # ... and only afterwards are the plain letters widened to accept
    # their accented variants.
    plain = [
        ('a', u'[aàâÀÂ]'),
        ('e', u'[eéèëêÉÊ]'),
        ('i', u'[iïîÎ]'),
        ('o', u'[oôÔ]'),
        ('u', u'[uûüù]'),
        ('c', u'[cç]'),
    ]

    parts = []
    for word in words:
        part = re.escape(word.lower())
        for old, new in accented + plain:
            part = part.replace(old, new)
        parts.append(part)
    return re.compile('|'.join(parts), re.I)