correction 2 bugs
[auf_savoirs_en_partage_django.git] / auf_savoirs_en_partage_django / savoirs / lib / recherche.py
CommitLineData
d15017b2
CR
1# -*- encoding: utf-8 -*-
2import urllib, httplib, time, simplejson, pprint, math, re
40a5ebfb 3from django.conf import settings
d15017b2 4from auf_savoirs_en_partage_backend.sep.io import SEP
f4db9f51 5from auf_savoirs_en_partage_backend.sep.utils import smart_str
d15017b2
CR
6from savoirs import configuration
7
f4db9f51 8
d15017b2
CR
def google_search (page, q, data):
    """Query the Google AJAX Search API and merge the hits into `data`.

    page -- 0-based result page number
    q    -- query string
    data -- accumulator dict mutated in place: 'results' (list of
            {'uri', 'content', 'title'}), 'last_page' (int),
            'more_link' (str)
    """
    params = {'q': q,
              'rsz': 'large',
              'v': '1.0',
              'start': page * configuration['resultats_par_page'],
              }

    if not settings.DEBUG:
        # Point Google at the live custom-search spec; the timestamp busts
        # Google's cache of the cref document.
        #TODO: corriger ts
        params['cref'] = "http://savoirsenpartage.auf.org/recherche.xml?%s" \
                % int(time.time())

    url = "/ajax/services/search/web?" + \
            urllib.urlencode (params)
    handle = httplib.HTTPConnection ('ajax.googleapis.com')
    try:
        handle.request ("GET", url)
        r = handle.getresponse ()
        response = simplejson.loads(r.read ())
    finally:
        # Always release the connection, even if the request or the
        # JSON decode raises.
        handle.close ()

    #print pprint.pformat (params)
    #print pprint.pformat (response)

    # On errors (quota exceeded, bad request, ...) Google returns
    # {"responseData": null, ...}; the old code crashed on that payload.
    response_data = response.get ('responseData')
    if not response_data:
        return

    if len (response_data['results']) > 0:
        # 'pages' labels are 1-based; track the highest page seen.
        for i in response_data['cursor']['pages']:
            p = int (i['label']) - 1
            if p > data['last_page']:
                data['last_page'] = p

        for r in response_data['results']:
            data['results'].append( {'uri': r['url'],
                                     'content': r['content'],
                                     'title': r['title']} )

    # 'moreResultsUrl' can be absent when there are no hits.
    data['more_link'] = response_data.get ('cursor', {}).get ('moreResultsUrl', '')
d15017b2
CR
42
43
def sep_build_content (regexp, description):
    """Build a highlighted excerpt (max ~200 chars) of `description`.

    regexp      -- pattern from make_regexp(): 3 groups, (delimiter)(word)(delimiter)
    description -- full plain-text description

    Returns the description (or an excerpt centred on the first match,
    cut on sentence boundaries and wrapped in "(...)" markers) with each
    matched word wrapped in <b>...</b>.
    """
    maxlen = 200
    content = description
    if len (description) > maxlen:
        # Centre the excerpt on the first match (start of string if none).
        start = 0
        loc = regexp.search (description)
        if loc:
            start = loc.start ()

        # Floor division: keeps slice bounds integral on Python 3 as well
        # (the old "/" produced a float there and broke the slicing).
        f = start - (maxlen // 2)
        t = 0
        if f < 0:
            # Window clipped at the left edge: give the right side the slack.
            t = -f
            f = 0
        t += start + (maxlen // 2)
        if f > 0:
            # Back up to the previous sentence boundary.
            while description[f] != '.' and f > 0:
                f -= 1
            if f > 0:
                f += 1
        if t < len (description):
            # Extend forward to the next sentence boundary.
            while t < len (description) and description[t] != '.':
                t += 1
            t += 1
        content = description[f:t]
        if f > 0:
            content = "(...) " + content
        if t < (len (description) - 1):
            content = content + " (...)"

    # Bug fix: the replacement must keep all three groups; the old
    # r'<b>\1</b>' dropped the matched word and its trailing delimiter.
    content = regexp.sub (r'\1<b>\2</b>\3', content)
    return content
75
76
f4db9f51
CR
def make_regexp (q):
    """Compile a highlighting regexp from the words of a query dict.

    q -- query dict; every value except q['operator'] contributes words.

    Returns a case-insensitive, Unicode-aware pattern with 3 groups:
    (non-word char)(matched word)(non-word char). Words shorter than
    3 characters are dropped; longer words come first in the alternation
    so they win over their own prefixes.
    """
    words = []
    w = re.compile (r'\W+', re.U)
    for k in q.keys ():
        if k != 'operator':
            # smart_str + decode normalises the value to unicode
            # (project helper; Python 2 semantics).
            words.extend(w.split (smart_str(q[k]).decode("utf-8")))
    # Portable replacements for the old filter()/cmp-sort combination,
    # which broke on Python 3; ordering is identical (stable, longest first).
    words = [x for x in words if len(x) > 2]
    words.sort (key=len, reverse=True)

    patt = "|".join (words)
    patt = r"([\W]{1})(" + patt + r")([\W]{1})"
    return re.compile (patt, re.I|re.U)
f4db9f51
CR
89
def hl (r, string):
    """Highlight every match of `r` in `string` with <b>...</b>.

    `r` is a 3-group pattern from make_regexp(); the surrounding
    delimiter characters are preserved. Returns None for None input.
    """
    if string is None:
        return None
    return r.sub (r'\1<b>\2</b>\3', string)
94
95
d15017b2
CR
def sep_search (page, q, data):
    """Run a query against the SEP backend and fill `data` with
    highlighted results.

    page -- 0-based result page number
    q    -- query dict understood by SEP.search() and make_regexp()
    data -- accumulator dict mutated in place: 'results' (list of result
            dicts) and 'last_page' (int)
    """
    per_page = configuration['resultats_par_page']
    f = page * per_page
    # Bug fix: the slice end was hard-coded to f + 8, disagreeing with
    # the configured page size used to compute last_page below.
    t = f + per_page
    s = SEP ()
    matches = s.search (q)

    data['last_page'] = math.ceil (float(len (matches)) / \
            float(per_page)) - 1
    page_results = s.get (matches[f:t])

    regexp = make_regexp (q)

    for r in page_results:
        # Prefer the original source URL; fall back to the SEP record URI.
        uri = r.get ("source", "")
        if len (uri) == 0:
            uri = r.get ("uri")

        # Bug fix: the title used to be pre-mangled by a sub() whose
        # replacement dropped the matched word; highlight it once with hl().
        title = r.get ("title", "")

        content = sep_build_content (regexp, r.get ("description", ""))

        contributeurs = r.get('contributor')
        if contributeurs is not None:
            contributeurs = "; ".join (contributeurs)

        subject = r.get ('subject')
        if subject is not None:
            subject = ", ".join (subject)

        # Guard: 'creator' may be absent from the record.
        createurs = r.get('creator') or []

        data['results'].append ({'uri': uri,
                                 'id': r.get("uri"),
                                 'title': hl(regexp, title),
                                 'content': hl(regexp, content),
                                 'creator': '; '.join([hl(regexp, x) for x in createurs]),
                                 'contributors': hl(regexp, contributeurs),
                                 'subject': hl(regexp, subject),
                                 'modified': r.get('modified'),
                                 'isbn': r.get('isbn'),
                                 })
d15017b2 133
d15017b2 134
def cherche (page, q, engin=None):
    """Dispatch a search to the configured engine.

    page  -- 0-based result page number
    q     -- query (unicode string for 'google'/'sep', dict for 'avancee')
    engin -- engine name ('google', 'sep' or 'avancee'); defaults to
             configuration['engin_recherche']

    Returns a dict with 'results', 'last_page' and 'more_link'.
    """
    resultats = {'results': [], 'last_page': 0, 'more_link': ''}

    moteur = engin if engin is not None else configuration['engin_recherche']

    if moteur == 'google':
        google_search (page, q, resultats)
    elif moteur == 'sep':
        # The SEP backend expects a query dict with UTF-8 bytes.
        sep_search (page, {'q': q.encode ('utf-8')}, resultats)
    elif moteur == 'avancee':
        sep_search (page, q, resultats)

    return resultats