New Legifrance (2020) layout #11

Open · wants to merge 12 commits into base: master
14 changes: 7 additions & 7 deletions legipy/cli.py
@@ -7,6 +7,8 @@
 import sys
 
 import click
+import requests_cache
+
 
 from legipy.models.base import LegipyModel
 from legipy.services.code_service import CodeService
@@ -25,7 +27,7 @@ def json_serial(obj):
 
 
 def current_legislature():
-    cur = [l for l in LegislatureService.legislatures() if l.end is None]
+    cur = [leg for leg in LegislatureService.legislatures() if leg.end is None]
     return cur[0].number
 
 
@@ -49,8 +51,10 @@ def _dump_items(ary):
 
 
 @click.group(short_help=u"Client for the `legifrance.gouv.fr` website.")
-def cli():
-    pass
+@click.option('--cache/--no-cache', default=False)
+def cli(cache):
+    if cache:
+        requests_cache.install_cache('legipy_cache')
 
 
 @cli.command(short_help=u"List published laws")
@@ -105,8 +109,6 @@ def codes():
 @click.option('--with-articles/--without-articles', default=False,
               help=u"Show details for each articles")
 def code(id_code, date_pub, with_articles):
-    if date_pub:
-        date_pub = date_pub.replace('-', '')  # 2018-02-01 => 20180201
     _dump_item(
         CodeService().code(id_code, date_pub, with_articles),
         error='No such code: %s' % id_code
@@ -119,8 +121,6 @@ def code(id_code, date_pub, with_articles):
 @click.option('--date-pub',
               help=u"Publication date (ISO format), default to today")
 def code_section(id_code, id_section, date_pub):
-    if date_pub:
-        date_pub = date_pub.replace('-', '')  # 2018-02-01 => 20180201
     _dump_item(SectionService().articles(id_code, id_section, date_pub))
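
Side note on the new --cache flag: requests_cache.install_cache patches the
requests library process-wide, so every page fetched by the services goes
through the cache. A minimal sketch of the behaviour (the cache name comes
from the diff; the .sqlite suffix and the from_cache attribute are standard
requests_cache behaviour, not shown in this PR):

    import requests
    import requests_cache

    # Same effect as running the CLI with --cache: responses are stored
    # in legipy_cache.sqlite in the working directory.
    requests_cache.install_cache('legipy_cache')

    requests.get('https://www.legifrance.gouv.fr/')         # network hit
    resp = requests.get('https://www.legifrance.gouv.fr/')  # served from cache
    print(resp.from_cache)  # True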
32 changes: 24 additions & 8 deletions legipy/common.py
@@ -2,6 +2,7 @@
 
 import datetime
 import re
+from bs4.element import Tag
 
 DOMAIN = 'www.legifrance.gouv.fr'
 
@@ -36,15 +37,7 @@
 }
 
 
-def servlet_url(servlet):
-    return 'https://%s/%s.do' % (DOMAIN, servlet)
-
-
-def page_url(page):
-    return 'https://%s/%s.jsp' % (DOMAIN, page)
-
-
 def new_page_url(page):
     return 'https://%s/%s' % (DOMAIN, page)
 
 
@@ -87,3 +80,26 @@ def parse_roman(string):
         total += value
 
     return total
+
+
+def find_all_non_nested(parent, *args, **kwargs):
+    """find_all for non-nested elements.
+
+    Same semantics as find_all(..., recursive=True), except that the
+    children of matched nodes are not searched.
+    """
+    # Python 2 has no keyword-only arguments, so pull `bfs` out of kwargs.
+    bfs = kwargs.pop('bfs', False)
+    kwargs['recursive'] = False
+
+    search = [parent]
+    found = []
+    while search:
+        node = search.pop(0 if bfs else -1)
+        found_at_node = node.find_all(*args, **kwargs)
+        found += found_at_node
+        search.extend(child for child in node.children if (
+            isinstance(child, Tag) and child not in found_at_node
+        ))
+
+    return found
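
A quick usage sketch for find_all_non_nested (markup invented for
illustration): matched nodes are returned but never descended into, so the
inner <ul> below is not reported.

    from bs4 import BeautifulSoup
    from legipy.common import find_all_non_nested

    html = ('<div><ul id="outer"><li><ul id="inner"></ul></li></ul>'
            '<ul id="side"></ul></div>')
    soup = BeautifulSoup(html, 'html5lib')

    uls = find_all_non_nested(soup.div, 'ul')
    print([ul['id'] for ul in uls])  # ['outer', 'side'], 'inner' is skipped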
4 changes: 3 additions & 1 deletion legipy/models/code.py
@@ -37,6 +37,8 @@ def __init__(self,
 class Article(LegipyModel):
     def __init__(self,
                  title,
-                 history):
+                 history,
+                 id_article=None):
         self.title = title
         self.history = history
+        self.id_article = id_article
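
For reference, a hypothetical instantiation of the extended model (both the
history string and the LEGIARTI identifier below are made-up values):

    from legipy.models.code import Article

    art = Article(title='Article 16',
                  history='Modifié par LOI n°2018-...',  # made-up entry
                  id_article='LEGIARTI000006419280')     # made-up id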
142 changes: 68 additions & 74 deletions legipy/parsers/code_parser.py
@@ -4,10 +4,12 @@
 import re
 
 from bs4 import BeautifulSoup
-from six.moves.urllib.parse import urljoin
+from six.moves.urllib.parse import urljoin, urldefrag
 
+from legipy.common import find_all_non_nested
 from legipy.common import cleanup_url
 from legipy.common import parse_date
+from legipy.common import merge_spaces
 from legipy.models.code import Article
 from legipy.models.code import Code
 from legipy.models.code import Section
@@ -41,13 +43,13 @@ def section_service(self):
         return self._section_service
 
     @classmethod
-    def parse_code_list(cls, html):
+    def parse_code_list(cls, url, html):
         soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
-        form = soup.find('form', attrs={'action': '/rechCodeArticle.do'})
-        select = form.find('select', attrs={'name': 'cidTexte'})
-        return [Code(option.attrs['value'], option.get_text())
-                for option in select.find_all('option')
-                if option.attrs['value'] != '*']
+        codes = [code.find('a') for code in soup.find_all('h2')]
+        return [Code(re.sub('^id', '', code.attrs['id']),
+                     code.get_text().strip(),
+                     url_code=urljoin(url, code.attrs['href']))
+                for code in codes if code is not None]
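
A sketch of the list-page markup the new parse_code_list expects (the id and
href values are illustrative, and the enclosing parser class is assumed to be
called CodeParser, which this diff does not show):

    from legipy.parsers.code_parser import CodeParser

    html = ('<h2><a id="idLEGITEXT000006070721" '
            'href="/codes/texte_lc/LEGITEXT000006070721">Code civil</a></h2>')
    codes = CodeParser.parse_code_list('https://www.legifrance.gouv.fr/', html)
    # -> [Code('LEGITEXT000006070721', 'Code civil',
    #          url_code='https://www.legifrance.gouv.fr/codes/texte_lc/...')]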

     def parse_code(self, url, html):
         """
@@ -63,87 +65,79 @@ def parse_code(self, url, html):
         """
         soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
 
-        # -- main text
-        div = (soup
-               .find('div', id='content_false')
-               .find('div', attrs={'class': 'data'}))
-
         code = Code(self.id_code,
                     date_pub=self.date_pub,
                     url_code=cleanup_url(url))
 
         # -- Code title/subtitle
-        div_title = div.find('div', id='titreTexte')
-        span_subtitle = div_title.find('span',
-                                       attrs={'class': 'sousTitreTexte'})
-        if span_subtitle:
-            code.title = div_title.text.replace(span_subtitle.text, '')
-            code.subtitle = span_subtitle.text.strip()
-            regex = r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})'
-            m = re.search(regex, code.subtitle)
-            if m:
-                code.date_pub = parse_date(m.group(1))
-
-        code.title = code.title.strip()
+        code.title = soup.h1.text.strip()
+        code.subtitle = soup.find('div', {'class': 'vigor-title'}).text.strip()
+        regex = (r'Version (?:en vigueur au|abrogée depuis le) '
+                 r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})')
+        m = re.search(regex, code.subtitle)
+        if m:
+            code.date_pub = parse_date(m.group(1))
 
         # -- TOC
-        code.children = [self.parse_code_ul(url, child)
-                         for child in div.find_all('ul', recursive=False)]
+        toc = soup.find('ul', id='liste-sommaire')
+        code.children = [self.parse_toc_element(url, partie)
+                         for partie in toc.find_all('li', recursive=False)]
 
         return code

-    def parse_code_ul(self, url, ul):
+    def parse_toc_element(self, url, li):
+        """Parse one TOC entry: returns an Article leaf or a Section node."""
-        li_list = ul.find_all('li', recursive=False)
-        li = li_list[0]
-        span_title = li.find('span',
-                             attrs={'class': re.compile(r'TM\d+Code')},
-                             recursive=False)
-
-        section = Section(span_title.attrs['id'], span_title.text.strip())
-        div_italic = li.find('div', attrs={'class': 'italic'}, recursive=False)
-        if div_italic:
-            section.content = div_italic.text.strip()
-        span_link = li.find('span',
-                            attrs={'class': 'codeLienArt'},
-                            recursive=False)
-        if span_link:
-            a_link = span_link.find('a', recursive=False)
-            if self.with_articles:
-                service = self.section_service
-                section.articles = service.articles(self.id_code,
-                                                    section.id_section,
-                                                    self.date_pub)
-            else:
-                section.articles = a_link.text.strip()
-            section.url_section = cleanup_url(
-                urljoin(url, a_link.attrs['href']))
-        section.children = [self.parse_code_ul(url, child)
-                            for child in li.find_all('ul', recursive=False)]
+        a_link = li.find('a', attrs={'class': 'articleLink'}, recursive=False)
+
+        if a_link:
+            # cleanup_url(urljoin(url, a_link.attrs['href']))
+            return Article(a_link.text.strip(), None,
+                           re.sub('^art', '', a_link.attrs['id']))
+
+        title = li.find(['span', 'a'], attrs={'class': 'title-link'},
+                        recursive=False)
+
+        match = re.match(r'(.*?)(?: \((Articles .*)\))?$',
+                         merge_spaces(title.text.strip()))
+        title_text, articles = match.groups()
+
+        section = Section(title.attrs['id'], title_text)
+
+        if 'href' in title.attrs:
+            section_url = urldefrag(urljoin(url, title.attrs['href']))[0]
+            section.url_section = urljoin(url, section_url)
+
+        for ul in find_all_non_nested(li, 'ul'):
+            for child_node in ul.find_all('li', recursive=False):
+                child = self.parse_toc_element(url, child_node)
+                if isinstance(child, Article) and self.with_articles:
+                    if section.articles is None:
+                        section.articles = []
+                    section.articles.append(child)
+                elif isinstance(child, Section):
+                    if section.children is None:
+                        section.children = []
+                    section.children.append(child)
+
+        if not section.children and not self.with_articles:
+            section.articles = articles
 
         return section
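
The title regex above splits a section heading from its trailing article
range; a small check (the heading text is illustrative):

    import re
    from legipy.common import merge_spaces

    heading = 'Chapitre II : Du respect du corps humain (Articles 16 à 16-14)'
    m = re.match(r'(.*?)(?: \((Articles .*)\))?$', merge_spaces(heading.strip()))
    print(m.groups())
    # ('Chapitre II : Du respect du corps humain', 'Articles 16 à 16-14')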


-def parser_articles(html):
+def parser_articles(url, html):
     soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
-    div = (soup
-           .find('div', id='content_false')
-           .find('div', attrs={'class': 'data'}))
-    div_list = div.find_all('div', attrs={'class': 'article'}, recursive=False)
     articles = []
-    for div_article in div_list:
-        div_title = div_article.find('div',
-                                     attrs={'class': 'titreArt'},
-                                     recursive=False)
-        title = div_title.text
-        a_link = div_title.find('a')
-        if a_link:
-            title = title.replace(a_link.text, '')
-        title = title.strip()
-        div_history = div_article.find_all('div',
-                                           attrs={'class': 'histoArt'},
-                                           recursive=False)
-        article = Article(title,
-                          [(entry.find('a') or entry.find('span')).text
-                           for entry in div_history])
-        articles.append(article)
+    for article in soup.find_all('article'):
+        # Repealed ("abrogés") articles have their heading in <h3>
+        title = article.find(['h2', 'h3'])
+
+        # or title.attrs['data-anchor']
+        article_id = re.sub('(-[0-9])*$', '', title.attrs['id'])
+
+        # Only the last modification
+        history = article.find('p', attrs={'class': 'date'})
+        history = history.text.strip() if history else None
+
+        articles.append(Article(title.text.strip(), history, article_id))
     return articles
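
A minimal sketch of the <article> markup the new parser_articles handles
(the id, heading and history strings are reconstructed from the selectors
above, so treat them as assumptions):

    from legipy.parsers.code_parser import parser_articles

    html = ('<article><h2 id="LEGIARTI000006419280">Article 16</h2>'
            '<p class="date">Modifié par Loi ...</p></article>')
    for art in parser_articles('https://www.legifrance.gouv.fr/', html):
        print(art.title, art.id_article)  # Article 16 LEGIARTI000006419280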
11 changes: 4 additions & 7 deletions legipy/parsers/law_parser.py
@@ -19,8 +19,7 @@ def parse_law(url, html, id_legi):
         id_legi=id_legi
     )
 
-    clean_title = merge_spaces(soup.h2.get_text()).strip()
-    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()
+    law.title = merge_spaces(soup.h1.get_text()).strip()
 
     if len(law.title) == 0:
         return None
@@ -53,15 +52,13 @@ def parse_law(url, html, id_legi):
     if pub_date:
         law.pub_date = parse_date(pub_date.group(1))
 
-    dos_senat = soup.find(lambda e: e.name == 'a' and (
-        re.search(r'/dossier-legislatif/', e['href']) or
-        re.search(r'/dossierleg/', e['href'])))
+    senat_url_re = re.compile(r'/dossierleg/|/dossier-legislatif/')
+    dos_senat = soup.find('a', href=senat_url_re)
     if dos_senat:
         law.url_senat = dos_senat['href'].split('#')[0]
         law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)
 
-    dos_an = soup.find(lambda e: e.name == 'a' and
-                       re.search(r'/dossiers/', e['href']))
+    dos_an = soup.find('a', href=re.compile(r'/dossiers/'))
 
     if dos_an:
         law.url_an = dos_an['href'].split('#')[0]
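
The Sénat link handling above relies on two regexes; a quick check with an
illustrative dossier URL:

    import re

    url = 'https://www.senat.fr/dossier-legislatif/pjl18-677.html'
    assert re.compile(r'/dossierleg/|/dossier-legislatif/').search(url)
    print(re.search(r'([^/]+)\.html$', url).group(1))  # pjl18-677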
26 changes: 16 additions & 10 deletions legipy/parsers/pending_law_list_parser.py
@@ -4,41 +4,47 @@
 import re
 
 from bs4 import BeautifulSoup
-from six.moves.urllib.parse import parse_qs
 from six.moves.urllib.parse import urljoin
 from six.moves.urllib.parse import urlparse
 
 from legipy.common import cleanup_url
 from legipy.common import merge_spaces
+from legipy.common import LAW_KINDS
 from legipy.models.law import Law
 
 
-def parse_pending_law_list(url, html):
+def parse_pending_law_list(url, html, **law_kwargs):
     soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
     results = []
 
-    for year_header in soup.find_all('h3'):
-        year = int(year_header.get_text())
-        ul = year_header.find_next_sibling('ul')
+    for year_header in soup.find_all('h2'):
+        year = int(year_header.get_text().strip())
+        ul = year_header.find_next('ul')
 
         if not ul:
             continue
 
         for law_entry in ul.select('li a'):
-            link_text = law_entry.get_text()
+            link_text = law_entry.get_text().strip()
             nor_num = re.search(r'\(([A-Z0-9]+)\)$', link_text)
 
+            type_loi = re.match(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
+                                .format('|'.join(LAW_KINDS)), link_text)
+            if type_loi is None:
+                continue
 
             url_legi = cleanup_url(urljoin(url, law_entry['href']))
-            qs_legi = parse_qs(urlparse(url_legi).query)
+            id_legi = urlparse(url_legi).path.strip('/').split('/')[-1]
 
             results.append(Law(
                 year=year,
-                legislature=int(qs_legi['legislature'][0]),
-                type=qs_legi['typeLoi'][0],
+                id_legi=id_legi,
+                type=type_loi.group(0).lower()[:4],
+                kind=type_loi.group(2),
                 title=merge_spaces(link_text),
                 nor=nor_num.group(1) if nor_num else None,
                 url_legi=url_legi,
-                id_legi=qs_legi['idDocument'][0]
+                **law_kwargs
             ))
 
     return results
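
To see what the type_loi match yields (LAW_KINDS lives in legipy.common and
its exact contents are not shown in this diff, so the tuple below is an
assumed subset):

    import re

    LAW_KINDS = ('constitutionnelle', 'organique')  # assumed subset
    link_text = 'Projet de loi organique relatif à ... (JUSX1234567L)'
    m = re.match(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
                 .format('|'.join(LAW_KINDS)), link_text)
    print(m.group(0).lower()[:4], m.group(2))  # proj organique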