New Legifrance (2020) layout #11

Open · wants to merge 12 commits into base: master
14 changes: 7 additions & 7 deletions legipy/cli.py
@@ -7,6 +7,8 @@
 import sys
 
 import click
+import requests_cache
+
 
 from legipy.models.base import LegipyModel
 from legipy.services.code_service import CodeService
@@ -25,7 +27,7 @@ def json_serial(obj):
 
 
 def current_legislature():
-    cur = [l for l in LegislatureService.legislatures() if l.end is None]
+    cur = [leg for leg in LegislatureService.legislatures() if leg.end is None]
     return cur[0].number
 
 
@@ -49,8 +51,10 @@ def _dump_items(ary):
 
 
 @click.group(short_help=u"Client for the `legifrance.gouv.fr` website.")
-def cli():
-    pass
+@click.option('--cache/--no-cache', default=False)
+def cli(cache):
+    if cache:
+        requests_cache.install_cache('legipy_cache')
 
 
 @cli.command(short_help=u"List published laws")
@@ -105,8 +109,6 @@ def codes():
 @click.option('--with-articles/--without-articles', default=False,
               help=u"Show details for each articles")
 def code(id_code, date_pub, with_articles):
-    if date_pub:
-        date_pub = date_pub.replace('-', '')  # 2018-02-01 => 20180201
     _dump_item(
         CodeService().code(id_code, date_pub, with_articles),
         error='No such code: %s' % id_code
@@ -119,8 +121,6 @@ def code(id_code, date_pub, with_articles):
 @click.option('--date-pub',
               help=u"Publication date (ISO format), default to today")
 def code_section(id_code, id_section, date_pub):
-    if date_pub:
-        date_pub = date_pub.replace('-', '')  # 2018-02-01 => 20180201
     _dump_item(SectionService().articles(id_code, id_section, date_pub))
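
Side note on the new --cache flag: requests_cache.install_cache patches the
requests library process-wide, so every page fetched by the services goes
through the cache. A minimal sketch of the behaviour (the cache name comes
from the diff; the .sqlite suffix and the from_cache attribute are standard
requests_cache behaviour, not shown in this PR):

    import requests
    import requests_cache

    # Same effect as running the CLI with --cache: responses are stored
    # in legipy_cache.sqlite in the working directory.
    requests_cache.install_cache('legipy_cache')

    requests.get('https://www.legifrance.gouv.fr/')         # network hit
    resp = requests.get('https://www.legifrance.gouv.fr/')  # served from cache
    print(resp.from_cache)  # True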
32 changes: 24 additions & 8 deletions legipy/common.py
@@ -2,6 +2,7 @@
 
 import datetime
 import re
+from bs4.element import Tag
 
 DOMAIN = 'www.legifrance.gouv.fr'
 
@@ -36,15 +37,7 @@
 }
 
 
-def servlet_url(servlet):
-    return 'https://%s/%s.do' % (DOMAIN, servlet)
-
-
-def page_url(page):
-    return 'https://%s/%s.jsp' % (DOMAIN, page)
-
-
 def new_page_url(page):
     return 'https://%s/%s' % (DOMAIN, page)
 
 
@@ -87,3 +80,26 @@ def parse_roman(string):
         total += value
 
     return total
+
+
+def find_all_non_nested(parent, *args, **kwargs):
+    """find_all for non-nested elements.
+
+    Same semantics as find_all(..., recursive=True), except that the
+    children of matched nodes are not searched.
+    """
+    # Python 2 has no keyword-only arguments, so pull `bfs` out of kwargs.
+    bfs = kwargs.pop('bfs', False)
+    kwargs['recursive'] = False
+
+    search = [parent]
+    found = []
+    while search:
+        node = search.pop(0 if bfs else -1)
+        found_at_node = node.find_all(*args, **kwargs)
+        found += found_at_node
+        search.extend(child for child in node.children if (
+            isinstance(child, Tag) and child not in found_at_node
+        ))
+
+    return found
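
A quick usage sketch for find_all_non_nested (markup invented for
illustration): matched nodes are returned but never descended into, so the
inner <ul> below is not reported.

    from bs4 import BeautifulSoup
    from legipy.common import find_all_non_nested

    html = ('<div><ul id="outer"><li><ul id="inner"></ul></li></ul>'
            '<ul id="side"></ul></div>')
    soup = BeautifulSoup(html, 'html5lib')

    uls = find_all_non_nested(soup.div, 'ul')
    print([ul['id'] for ul in uls])  # ['outer', 'side'], 'inner' is skipped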
4 changes: 3 additions & 1 deletion legipy/models/code.py
@@ -37,6 +37,8 @@ def __init__(self,
 class Article(LegipyModel):
     def __init__(self,
                  title,
-                 history):
+                 history,
+                 id_article=None):
         self.title = title
         self.history = history
+        self.id_article = id_article
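
For reference, a hypothetical instantiation of the extended model (both the
history string and the LEGIARTI identifier below are made-up values):

    from legipy.models.code import Article

    art = Article(title='Article 16',
                  history='Modifié par LOI n°2018-...',  # made-up entry
                  id_article='LEGIARTI000006419280')     # made-up id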
142 changes: 68 additions & 74 deletions legipy/parsers/code_parser.py
@@ -4,10 +4,12 @@
 import re
 
 from bs4 import BeautifulSoup
-from six.moves.urllib.parse import urljoin
+from six.moves.urllib.parse import urljoin, urldefrag
 
+from legipy.common import find_all_non_nested
 from legipy.common import cleanup_url
 from legipy.common import parse_date
+from legipy.common import merge_spaces
 from legipy.models.code import Article
 from legipy.models.code import Code
 from legipy.models.code import Section
@@ -41,13 +43,13 @@ def section_service(self):
         return self._section_service
 
     @classmethod
-    def parse_code_list(cls, html):
+    def parse_code_list(cls, url, html):
         soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
-        form = soup.find('form', attrs={'action': '/rechCodeArticle.do'})
-        select = form.find('select', attrs={'name': 'cidTexte'})
-        return [Code(option.attrs['value'], option.get_text())
-                for option in select.find_all('option')
-                if option.attrs['value'] != '*']
+        codes = [code.find('a') for code in soup.find_all('h2')]
+        return [Code(re.sub('^id', '', code.attrs['id']),
+                     code.get_text().strip(),
+                     url_code=urljoin(url, code.attrs['href']))
+                for code in codes if code is not None]
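
A sketch of the list-page markup the new parse_code_list expects (the id and
href values are illustrative, and the enclosing parser class is assumed to be
called CodeParser, which this diff does not show):

    from legipy.parsers.code_parser import CodeParser

    html = ('<h2><a id="idLEGITEXT000006070721" '
            'href="/codes/texte_lc/LEGITEXT000006070721">Code civil</a></h2>')
    codes = CodeParser.parse_code_list('https://www.legifrance.gouv.fr/', html)
    # -> [Code('LEGITEXT000006070721', 'Code civil',
    #          url_code='https://www.legifrance.gouv.fr/codes/texte_lc/...')]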

     def parse_code(self, url, html):
         """
@@ -63,87 +65,79 @@ def parse_code(self, url, html):
         """
         soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
 
-        # -- main text
-        div = (soup
-               .find('div', id='content_false')
-               .find('div', attrs={'class': 'data'}))
-
         code = Code(self.id_code,
                     date_pub=self.date_pub,
                     url_code=cleanup_url(url))
 
         # -- Code title/subtitle
-        div_title = div.find('div', id='titreTexte')
-        span_subtitle = div_title.find('span',
-                                       attrs={'class': 'sousTitreTexte'})
-        if span_subtitle:
-            code.title = div_title.text.replace(span_subtitle.text, '')
-            code.subtitle = span_subtitle.text.strip()
-            regex = r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})'
-            m = re.search(regex, code.subtitle)
-            if m:
-                code.date_pub = parse_date(m.group(1))
-
-        code.title = code.title.strip()
+        code.title = soup.h1.text.strip()
+        code.subtitle = soup.find('div', {'class': 'vigor-title'}).text.strip()
+        regex = (r'Version (?:en vigueur au|abrogée depuis le) '
+                 r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})')
+        m = re.search(regex, code.subtitle)
+        if m:
+            code.date_pub = parse_date(m.group(1))
 
         # -- TOC
-        code.children = [self.parse_code_ul(url, child)
-                         for child in div.find_all('ul', recursive=False)]
+        toc = soup.find('ul', id='liste-sommaire')
+        code.children = [self.parse_toc_element(url, partie)
+                         for partie in toc.find_all('li', recursive=False)]
 
         return code

-    def parse_code_ul(self, url, ul):
+    def parse_toc_element(self, url, li):
+        """Parse one TOC entry: returns an Article leaf or a Section node."""
-        li_list = ul.find_all('li', recursive=False)
-        li = li_list[0]
-        span_title = li.find('span',
-                             attrs={'class': re.compile(r'TM\d+Code')},
-                             recursive=False)
-
-        section = Section(span_title.attrs['id'], span_title.text.strip())
-        div_italic = li.find('div', attrs={'class': 'italic'}, recursive=False)
-        if div_italic:
-            section.content = div_italic.text.strip()
-        span_link = li.find('span',
-                            attrs={'class': 'codeLienArt'},
-                            recursive=False)
-        if span_link:
-            a_link = span_link.find('a', recursive=False)
-            if self.with_articles:
-                service = self.section_service
-                section.articles = service.articles(self.id_code,
-                                                    section.id_section,
-                                                    self.date_pub)
-            else:
-                section.articles = a_link.text.strip()
-            section.url_section = cleanup_url(
-                urljoin(url, a_link.attrs['href']))
-        section.children = [self.parse_code_ul(url, child)
-                            for child in li.find_all('ul', recursive=False)]
+        a_link = li.find('a', attrs={'class': 'articleLink'}, recursive=False)
+
+        if a_link:
+            # cleanup_url(urljoin(url, a_link.attrs['href']))
+            return Article(a_link.text.strip(), None,
+                           re.sub('^art', '', a_link.attrs['id']))
+
+        title = li.find(['span', 'a'], attrs={'class': 'title-link'},
+                        recursive=False)
+
+        match = re.match(r'(.*?)(?: \((Articles .*)\))?$',
+                         merge_spaces(title.text.strip()))
+        title_text, articles = match.groups()
+
+        section = Section(title.attrs['id'], title_text)
+
+        if 'href' in title.attrs:
+            section_url = urldefrag(urljoin(url, title.attrs['href']))[0]
+            section.url_section = urljoin(url, section_url)
+
+        for ul in find_all_non_nested(li, 'ul'):
+            for child_node in ul.find_all('li', recursive=False):
+                child = self.parse_toc_element(url, child_node)
+                if isinstance(child, Article) and self.with_articles:
+                    if section.articles is None:
+                        section.articles = []
+                    section.articles.append(child)
+                elif isinstance(child, Section):
+                    if section.children is None:
+                        section.children = []
+                    section.children.append(child)
+
+        if not section.children and not self.with_articles:
+            section.articles = articles
 
         return section
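
The title regex above splits a section heading from its trailing article
range; a small check (the heading text is illustrative):

    import re
    from legipy.common import merge_spaces

    heading = 'Chapitre II : Du respect du corps humain (Articles 16 à 16-14)'
    m = re.match(r'(.*?)(?: \((Articles .*)\))?$', merge_spaces(heading.strip()))
    print(m.groups())
    # ('Chapitre II : Du respect du corps humain', 'Articles 16 à 16-14')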


-def parser_articles(html):
+def parser_articles(url, html):
     soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
-    div = (soup
-           .find('div', id='content_false')
-           .find('div', attrs={'class': 'data'}))
-    div_list = div.find_all('div', attrs={'class': 'article'}, recursive=False)
     articles = []
-    for div_article in div_list:
-        div_title = div_article.find('div',
-                                     attrs={'class': 'titreArt'},
-                                     recursive=False)
-        title = div_title.text
-        a_link = div_title.find('a')
-        if a_link:
-            title = title.replace(a_link.text, '')
-        title = title.strip()
-        div_history = div_article.find_all('div',
-                                           attrs={'class': 'histoArt'},
-                                           recursive=False)
-        article = Article(title,
-                          [(entry.find('a') or entry.find('span')).text
-                           for entry in div_history])
-        articles.append(article)
+    for article in soup.find_all('article'):
+        # Repealed ("abrogés") articles have their heading in <h3>
+        title = article.find(['h2', 'h3'])
+
+        # or title.attrs['data-anchor']
+        article_id = re.sub('(-[0-9])*$', '', title.attrs['id'])
+
+        # Only the last modification
+        history = article.find('p', attrs={'class': 'date'})
+        history = history.text.strip() if history else None
+
+        articles.append(Article(title.text.strip(), history, article_id))
     return articles
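
A minimal sketch of the <article> markup the new parser_articles handles
(the id, heading and history strings are reconstructed from the selectors
above, so treat them as assumptions):

    from legipy.parsers.code_parser import parser_articles

    html = ('<article><h2 id="LEGIARTI000006419280">Article 16</h2>'
            '<p class="date">Modifié par Loi ...</p></article>')
    for art in parser_articles('https://www.legifrance.gouv.fr/', html):
        print(art.title, art.id_article)  # Article 16 LEGIARTI000006419280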
11 changes: 4 additions & 7 deletions legipy/parsers/law_parser.py
@@ -19,8 +19,7 @@ def parse_law(url, html, id_legi):
         id_legi=id_legi
     )
 
-    clean_title = merge_spaces(soup.h2.get_text()).strip()
-    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()
+    law.title = merge_spaces(soup.h1.get_text()).strip()
 
     if len(law.title) == 0:
         return None
@@ -53,15 +52,13 @@ def parse_law(url, html, id_legi):
     if pub_date:
         law.pub_date = parse_date(pub_date.group(1))
 
-    dos_senat = soup.find(lambda e: e.name == 'a' and (
-        re.search(r'/dossier-legislatif/', e['href']) or
-        re.search(r'/dossierleg/', e['href'])))
+    senat_url_re = re.compile(r'/dossierleg/|/dossier-legislatif/')
+    dos_senat = soup.find('a', href=senat_url_re)
     if dos_senat:
         law.url_senat = dos_senat['href'].split('#')[0]
         law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)
 
-    dos_an = soup.find(lambda e: e.name == 'a' and
-                       re.search(r'/dossiers/', e['href']))
+    dos_an = soup.find('a', href=re.compile(r'/dossiers/'))
 
     if dos_an:
         law.url_an = dos_an['href'].split('#')[0]
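
The Sénat link handling above relies on two regexes; a quick check with an
illustrative dossier URL:

    import re

    url = 'https://www.senat.fr/dossier-legislatif/pjl18-677.html'
    assert re.compile(r'/dossierleg/|/dossier-legislatif/').search(url)
    print(re.search(r'([^/]+)\.html$', url).group(1))  # pjl18-677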
26 changes: 16 additions & 10 deletions legipy/parsers/pending_law_list_parser.py
@@ -4,41 +4,47 @@
 import re
 
 from bs4 import BeautifulSoup
-from six.moves.urllib.parse import parse_qs
 from six.moves.urllib.parse import urljoin
 from six.moves.urllib.parse import urlparse
 
 from legipy.common import cleanup_url
 from legipy.common import merge_spaces
+from legipy.common import LAW_KINDS
 from legipy.models.law import Law
 
 
-def parse_pending_law_list(url, html):
+def parse_pending_law_list(url, html, **law_kwargs):
     soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
     results = []
 
-    for year_header in soup.find_all('h3'):
-        year = int(year_header.get_text())
-        ul = year_header.find_next_sibling('ul')
+    for year_header in soup.find_all('h2'):
+        year = int(year_header.get_text().strip())
+        ul = year_header.find_next('ul')
 
         if not ul:
             continue
 
         for law_entry in ul.select('li a'):
-            link_text = law_entry.get_text()
+            link_text = law_entry.get_text().strip()
             nor_num = re.search(r'\(([A-Z0-9]+)\)$', link_text)
 
+            type_loi = re.match(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
+                                .format('|'.join(LAW_KINDS)), link_text)
+            if type_loi is None:
+                continue
 
             url_legi = cleanup_url(urljoin(url, law_entry['href']))
-            qs_legi = parse_qs(urlparse(url_legi).query)
+            id_legi = urlparse(url_legi).path.strip('/').split('/')[-1]
 
             results.append(Law(
                 year=year,
-                legislature=int(qs_legi['legislature'][0]),
-                type=qs_legi['typeLoi'][0],
+                id_legi=id_legi,
+                type=type_loi.group(0).lower()[:4],
+                kind=type_loi.group(2),
                 title=merge_spaces(link_text),
                 nor=nor_num.group(1) if nor_num else None,
                 url_legi=url_legi,
-                id_legi=qs_legi['idDocument'][0]
+                **law_kwargs
             ))
 
     return results
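
To see what the type_loi match yields (LAW_KINDS lives in legipy.common and
its exact contents are not shown in this diff, so the tuple below is an
assumed subset):

    import re

    LAW_KINDS = ('constitutionnelle', 'organique')  # assumed subset
    link_text = 'Projet de loi organique relatif à ... (JUSX1234567L)'
    m = re.match(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
                 .format('|'.join(LAW_KINDS)), link_text)
    print(m.group(0).lower()[:4], m.group(2))  # proj organique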