3 from itertools
import chain
4 from urllib
import urlopen
5 from urlparse
import urljoin
7 from BeautifulSoup
import BeautifulSoup
9 from savoirs
.lib
.utils
import meta_set
13 'dc.description': 'description',
15 'dc.format': 'format',
16 'dc.identifier': 'uri',
17 'dc.language': 'language',
18 'dc.creator': 'creator',
19 'dc.contributor': 'contributor',
26 Moissonneur pour les systèmes Lodel 0.9.
29 BASE_URL
= options
['url']
32 url
= urljoin(BASE_URL
, path
)
36 return BeautifulSoup(html
)
40 uri
= urljoin(BASE_URL
, path
)
41 node
= {'identifier': uri
, 'uri': uri
}
42 for meta
in soup
.head('meta'):
43 name
= meta
.get('name')
44 content
= meta
.get('content')
45 if not (name
and content
):
47 field
= META_MAP
.get(name
.lower())
50 meta_set(node
, field
, content
)
53 index_soup
= get_soup('/')
55 a
['href'] for a
in chain
.from_iterable(
56 ul('a', href
=True) for ul
in index_soup('ul', 'typepersonne')
60 a
['href'] for a
in chain
.from_iterable(
61 get_soup(uri
)('a', 'auteur', href
=True) for uri
in auteur_index_uris
62 ) if a
.has_key('href')
65 a
['href'] for a
in chain
.from_iterable(
66 ul('a', href
=True) for ul
in index_soup('ul', 'issues')
67 ) if a
.has_key('href')
69 article_uris
= set(chain(
71 (a
['href'] for a
in chain
.from_iterable(
72 dl('a', href
=True) for dl
in chain
.from_iterable(
73 get_soup(uri
)('dl', 'listArticles') for uri
in auteur_uris
75 ) if a
.has_key('href')),
76 (a
['href'] for a
in chain
.from_iterable(
77 ul('a', href
=True) for ul
in chain
.from_iterable(
78 get_soup(uri
)('ul', 'summary') for uri
in numero_uris
80 ) if a
.has_key('href'))
82 nodes
= [get_node(uri
) for uri
in article_uris
]