[#2205] Harvesting of FAUN
[auf_savoirs_en_partage_django.git] / auf_savoirs_en_partage/savoirs/lib/harvesters/lodel/html09.py
# encoding: utf-8

from itertools import chain
from urllib import urlopen
from urlparse import urljoin

from BeautifulSoup import BeautifulSoup

from savoirs.lib.utils import meta_set

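# Dublin Core <meta> names mapped to the fields stored on each harvested record.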
META_MAP = {
    'dc.title': 'title',
    'dc.description': 'description',
    'dc.type': 'type',
    'dc.format': 'format',
    'dc.identifier': 'uri',
    'dc.language': 'language',
    'dc.creator': 'creator',
    'dc.contributor': 'contributor',
    'dc.date': 'issued',
}


def harvest(options):
    """
    Harvester for Lodel 0.9 systems.
    """

    BASE_URL = options['url']

    def get_soup(path):
        url = urljoin(BASE_URL, path)
        f = urlopen(url)
        html = f.read()
        f.close()
        return BeautifulSoup(html)

    def get_node(path):
        # Build one record per article page from its Dublin Core <meta> tags.
        soup = get_soup(path)
        uri = urljoin(BASE_URL, path)
        node = {'identifier': uri, 'uri': uri}
        for meta in soup.head('meta'):
            name = meta.get('name')
            content = meta.get('content')
            if not (name and content):
                continue
            field = META_MAP.get(name.lower())
            if not field:
                continue
            meta_set(node, field, content)
        return node

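    # Crawl strategy: from the site index, follow the author index pages
    # ("typepersonne" lists) and the issue pages ("issues" lists), then collect
    # every article linked from author pages and from each issue's summary.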
    index_soup = get_soup('/')
    auteur_index_uris = (
        a['href'] for a in chain.from_iterable(
            ul('a', href=True) for ul in index_soup('ul', 'typepersonne')
        )
    )
    auteur_uris = (
        a['href'] for a in chain.from_iterable(
            get_soup(uri)('a', 'auteur', href=True) for uri in auteur_index_uris
        ) if a.has_key('href')
    )
    # Materialized as a list: it is iterated twice below, once directly and
    # once to fetch each issue's summary page; a generator would be exhausted
    # after the first pass and the summaries would never be visited.
    numero_uris = [
        a['href'] for a in chain.from_iterable(
            ul('a', href=True) for ul in index_soup('ul', 'issues')
        ) if a.has_key('href')
    ]
    article_uris = set(chain(
        numero_uris,
        (a['href'] for a in chain.from_iterable(
            dl('a', href=True) for dl in chain.from_iterable(
                get_soup(uri)('dl', 'listArticles') for uri in auteur_uris
            )
        ) if a.has_key('href')),
        (a['href'] for a in chain.from_iterable(
            ul('a', href=True) for ul in chain.from_iterable(
                get_soup(uri)('ul', 'summary') for uri in numero_uris
            )
        ) if a.has_key('href'))
    ))
    nodes = [get_node(uri) for uri in article_uris]
    return nodes
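

# Usage sketch (illustrative only, not part of the original module): the
# harvester needs nothing beyond a base 'url' entry in the options dict; the
# address below is a placeholder.  Each returned node is a dict holding
# 'identifier', 'uri' and the Dublin Core fields listed in META_MAP.
if __name__ == '__main__':
    nodes = harvest({'url': 'http://revue.example.org/'})
    print '%d articles harvested' % len(nodes)
    for node in nodes:
        print node['uri']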