fix pb encodage avec recherche SEP
[auf_savoirs_en_partage_django.git] / auf_savoirs_en_partage / savoirs / lib / recherche.py
CommitLineData
d15017b2
CR
1# -*- encoding: utf-8 -*-
2import urllib, httplib, time, simplejson, pprint, math, re
40a5ebfb 3from django.conf import settings
0cc5f772
CR
4from sep import SEP
5from utils import smart_str
ecc62927 6from savoirs.globals import configuration
d15017b2 7
f4db9f51 8
d15017b2
CR
def google_search(page, q, data):
    """Query the Google AJAX web-search endpoint and merge the hits into *data*.

    page -- zero-based page index; multiplied by
            ``configuration['resultats_par_page']`` to get the ``start`` offset.
    q    -- query string sent as the ``q`` parameter.
    data -- result dict mutated in place: hits are appended to
            ``data['results']`` (dicts with ``uri``, ``content``, ``title``),
            ``data['last_page']`` is raised to the highest page index found in
            the response cursor, and ``data['more_link']`` is set when the
            response provides a ``moreResultsUrl``.

    Nothing is returned. When the response carries no results (including a
    null ``responseData``), *data* is left untouched.
    """
    params = {
        'q': q,
        'rsz': 'large',
        'v': '1.0',
        'start': page * configuration['resultats_par_page'],
        # The configured template receives the current timestamp.
        # NOTE(review): presumably a cache-buster for the CSE definition
        # URL -- confirm against configuration['google_xml'].
        'cref': configuration['google_xml'] % time.time(),
    }
    url = "/ajax/services/search/web?" + urllib.urlencode(params)

    handle = httplib.HTTPConnection('ajax.googleapis.com')
    try:
        handle.request("GET", url)
        response = simplejson.loads(handle.getresponse().read())
    finally:
        # Release the socket even if the request or the JSON parsing fails.
        handle.close()

    # 'responseData' may be null/absent; treat that as "no results" instead
    # of crashing on None['results'].
    payload = response.get('responseData') or {}
    hits = payload.get('results') or []
    if not hits:
        return

    cursor = payload.get('cursor') or {}
    for entry in cursor.get('pages') or []:
        # Labels are 1-based page numbers; last_page is 0-based.
        p = int(entry['label']) - 1
        if p > data['last_page']:
            data['last_page'] = p

    for hit in hits:
        data['results'].append({
            'uri': hit['url'],
            'content': hit['content'],
            'title': hit['title'],
        })

    if 'moreResultsUrl' in cursor:
        data['more_link'] = cursor['moreResultsUrl']
d15017b2
CR
39
40
def sep_build_content(regexp, description):
    """Build a highlighted excerpt of *description* for a result listing.

    regexp      -- compiled pattern with three groups (prefix, term, suffix);
                   the first match is used to centre the excerpt and every
                   match is rewritten as ``\\1<b>\\2</b>\\3``.
    description -- text to excerpt; ``None`` is treated as an empty string.

    Descriptions of at most 200 characters are returned whole. Longer ones
    are cut to a window of roughly 200 characters around the first match
    (or the beginning when nothing matches), widened to the surrounding
    '.' characters, with "(...)" markers where text was dropped.

    Returns the excerpt with matches wrapped in ``<b>`` tags.
    NOTE(review): the text is not HTML-escaped here -- confirm the caller
    or template handles escaping.
    """
    maxlen = 200
    if description is None:
        # Records may carry an explicit None; len(None) would raise.
        description = ""
    content = description

    if len(description) > maxlen:
        # Floor division keeps the offsets integral whatever the division
        # semantics in effect (slice indices must be ints).
        half = maxlen // 2

        loc = regexp.search(description)
        start = loc.start() if loc else 0

        f = start - half
        t = 0
        if f < 0:
            # Window would start before the text: shift it right instead.
            t = -f
            f = 0
        t += start + half

        if f > 0:
            # Walk back to the previous '.', then step past it.
            while description[f] != '.' and f > 0:
                f -= 1
            if f > 0:
                f += 1

        if t < len(description):
            # Walk forward to the next '.', and include it.
            while t < len(description) and description[t] != '.':
                t += 1
            t += 1

        content = description[f:t]
        if f > 0:
            content = "(...) " + content
        if t < (len(description) - 1):
            content = content + " (...)"

    return regexp.sub(r'\1<b>\2</b>\3', content)
72
73
f4db9f51
CR
def make_regexp(q):
    """Compile the highlighting pattern for the search terms found in *q*.

    q -- mapping of query fields to text; the ``'operator'`` entry is
         skipped, every other value is decoded as UTF-8 and split on runs
         of non-word characters.

    Only words longer than two characters are kept. The returned pattern
    is case-insensitive, Unicode-aware and always exposes three groups so
    that the ``\\1<b>\\2</b>\\3`` replacement used by the callers keeps
    working:

      1. the separator before the term (empty at the start of the string),
      2. the term itself,
      3. always empty -- the separator after the term is only looked at,
         not consumed, so two adjacent terms can both match.

    When no usable word remains, a pattern that never matches is returned
    (an empty alternation would match any two adjacent separators and
    inject empty ``<b></b>`` tags).
    """
    splitter = re.compile(r'\W+', re.U)
    words = []
    for k in q:
        if k != 'operator':
            words.extend(splitter.split(smart_str(q[k]).decode("utf-8")))

    words = [w for w in words if len(w) > 2]
    if not words:
        return re.compile(r'(?!x)x')

    # Longest first, so that an alternative is never shadowed by one of
    # its own prefixes.
    words.sort(key=len, reverse=True)

    # The words come from a split on \W+, so they contain only word
    # characters and need no escaping.
    patt = r"(^|\W)(" + "|".join(words) + r")()(?=\W|$)"
    return re.compile(patt, re.I | re.U)
f4db9f51
CR
86
def hl(r, string):
    """Wrap every match of *r* in *string* with ``<b>`` tags.

    r      -- compiled pattern with three groups; the second one is the
              part that ends up inside the tags.
    string -- text to highlight, or ``None``.

    Returns the highlighted text, or ``None`` when *string* is ``None``.
    """
    if string is None:
        return None
    highlighted = r.sub(r'\1<b>\2</b>\3', string)
    return highlighted
91
92
d15017b2
CR
def sep_search(page, q, data):
    """Run a SEP search and append one page of formatted results to *data*.

    page -- zero-based page index.
    q    -- query passed as-is to ``SEP().search`` and to ``make_regexp``
            (a mapping of field names to text).
    data -- result dict mutated in place: ``data['last_page']`` receives
            the zero-based index of the last page (0 when nothing matches)
            and one dict per record is appended to ``data['results']``.

    Nothing is returned.
    """
    per_page = configuration['resultats_par_page']
    first = page * per_page

    sep = SEP()
    matches = sep.search(q)

    # Integer page index, never negative, like the one google_search stores.
    page_count = int(math.ceil(float(len(matches)) / per_page))
    data['last_page'] = max(0, page_count - 1)

    # Slice with the configured page size so that the window agrees with
    # the offset and with last_page computed above.
    records = sep.get(matches[first:first + per_page])
    regexp = make_regexp(q)

    for record in records:
        # Prefer the original source; fall back on the record's own URI
        # when the source is missing, empty or None.
        uri = record.get("source") or record.get("uri")

        title = record.get("title", "")
        content = sep_build_content(regexp, record.get("description", ""))

        contributeurs = record.get('contributor')
        if contributeurs is not None:
            contributeurs = "; ".join(contributeurs)

        subject = record.get('subject')
        if subject is not None:
            subject = ", ".join(subject)

        # 'creator' may be absent or None; skip None entries, which
        # hl() would pass through and join() would reject.
        creators = [hl(regexp, c)
                    for c in (record.get('creator') or [])
                    if c is not None]

        data['results'].append({
            'uri': uri,
            'id': record.get("uri"),
            'title': hl(regexp, title),
            'content': content,
            'creator': '; '.join(creators),
            'contributors': hl(regexp, contributeurs),
            'subject': hl(regexp, subject),
            'modified': record.get('modified'),
            'isbn': record.get('isbn'),
        })
d15017b2 129
d15017b2 130
def cherche(page, q, engin=None):
    """Dispatch a search to the selected engine and return its results.

    page  -- zero-based page index forwarded to the engine.
    q     -- query; a plain string for the 'google' and 'sep' engines, an
             already-built field mapping for 'avancee'.
    engin -- engine name ('google', 'sep' or 'avancee'); when None, the
             value of ``configuration['engin_recherche']`` is used.

    Returns a dict with the keys ``results``, ``last_page`` and
    ``more_link``; it is returned unfilled for an unrecognised engine.
    """
    moteur = configuration['engin_recherche'] if engin is None else engin
    resultat = {'results': [], 'last_page': 0, 'more_link': ''}

    if moteur == 'google':
        google_search(page, q, resultat)
    elif moteur in ('sep', 'avancee'):
        # The simple SEP search wraps the raw string into a one-field
        # query; the advanced search passes its query through unchanged.
        requete = {'q': q} if moteur == 'sep' else q
        sep_search(page, requete, resultat)

    return resultat