Skip to content

Commit

Permalink
Merge branch 'main' into fix_colo_reset_variables
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie authored Oct 21, 2024
2 parents 0cb0f2a + 96acad0 commit 302163a
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 96 deletions.
15 changes: 13 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,24 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.30 - 2024-10-10**
**2.6.31 - 2024-10-21**

Fixes:
- fix `CADC` oral arguments
- `neb` now handles rows with no links
- `coloctapp` update cleanup_content
- fix `la` xpath selector that was skipping some cases

Features:
- new scraper `lactapp_5` for Louisiana Court of Appeal, Fifth Circuit
- now sending a `logger.error` call to Sentry when a scraped date is in the future

## Past

**2.6.30 - 2024-10-10**

Fixes:
- fix `CADC` oral arguments

**2.6.29 - 2024-10-10**

Fixes:
Expand Down
69 changes: 69 additions & 0 deletions juriscraper/lib/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import re
from datetime import date, datetime
from itertools import chain, islice, tee

from juriscraper.AbstractSite import logger

from .string_utils import force_unicode


Expand Down Expand Up @@ -51,3 +54,69 @@ def clean_court_object(obj):
return re.sub(r"\s+,", ",", s)
else:
return obj


def backscrape_over_paginated_results(
    url_template: str,
    first_page: int,
    last_page: int,
    start_date: date,
    end_date: date,
    date_fmt: str,
    site,
) -> list[dict]:
    """
    Iterates over consecutive pages, looking for cases in a specific date range
    Of use when the page offers no date filters, so one must look through all the pages
    Assumes the page is returning results ordered by descending date

    :param url_template: string to apply .format() to, like "url&page={}"
        where the argument to pass will be the page number
    :param first_page: integer of the first page to request
    :param last_page: exclusive upper bound for the page number; pages are
        taken from `range(first_page, last_page)`. Callers may overshoot the
        real last page, since iteration stops at the first empty page
    :param start_date: cases with a date greater than this value will be collected
    :param end_date: cases with a date less than this value will be collected
    :param date_fmt: date format to parse case dates
    :param site: the site object. Its `url`, `html` and `cases` attributes
        are overwritten on every page, and its `_download()` and
        `_process_html()` methods are called to populate `site.cases` with
        dicts that have a "date" key
    :return: the list of cases between the dates
    """
    cases = []

    # Compare plain dates only; datetime and date objects cannot be ordered
    # against each other
    if isinstance(start_date, datetime):
        start_date = start_date.date()
    if isinstance(end_date, datetime):
        end_date = end_date.date()

    for page in range(first_page, last_page):
        site.cases = []  # reset results container
        site.url = url_template.format(page)
        site.html = site._download()
        site._process_html()

        # An empty page means we ran past the last page of results;
        # indexing into it below would raise an IndexError
        if not site.cases:
            logger.info(
                "Finishing backscrape: results page %s has no cases", page
            )
            break

        # results are ordered by descending date
        earliest = datetime.strptime(site.cases[-1]["date"], date_fmt).date()
        latest = datetime.strptime(site.cases[0]["date"], date_fmt).date()
        logger.info("Results page has date range %s to %s", earliest, latest)

        # no intersection between date ranges. Both bounds are exclusive,
        # so the page [earliest, latest] misses (start_date, end_date) only
        # when it lies entirely at or before the start, or at or after the
        # end. A page whose cases all share one in-range date does intersect
        if latest <= start_date or earliest >= end_date:
            # if earliest date from results is earlier than
            # the start date, no need to iterate any further
            if earliest < start_date:
                logger.info(
                    "Finishing backscrape: earliest results date is %s earlier than start %s",
                    earliest,
                    start_date,
                )
                break
            continue

        # if there is an intersection, test every case and
        # collect the matching cases
        for case in site.cases:
            case_date = datetime.strptime(case["date"], date_fmt).date()
            if start_date < case_date < end_date:
                cases.append(case)

    return cases
49 changes: 21 additions & 28 deletions juriscraper/opinions/united_states/state/lactapp_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,31 @@
2019-11-24: Created by mmantel
"""

import math
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import (
get_row_column_links,
get_row_column_text,
)
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
first_opinion_date = datetime(2006, 11, 3)
# Ensure the backscrape iterable has a single item
days_interval = (datetime.today() - first_opinion_date).days + 2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self._page_size = 50
self._base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={self._page_size}"
self.url = self._base_url
self.back_scrape_iterable = self._generate_back_scrape_range()
page_size = 50
self.base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={page_size}"
self.url = self.base_url
self.make_backscrape_iterable(kwargs)
self.is_backscrape = False

# The opinions page does not indicate whether a case is
# published or unpublished. That is only found in the PDF.
Expand All @@ -37,7 +44,7 @@ def _process_html(self):
for row in self.html.cssselect("#opinion_contentTable tbody tr"):
self.cases.append(
{
"date": get_row_column_text(row, 1),
"date": get_row_column_text(row, 1).replace(" ", ""),
"docket": self._parse_docket_numbers(row),
"name": get_row_column_text(row, 4),
"url": get_row_column_links(row, 3),
Expand All @@ -54,25 +61,11 @@ def _parse_docket_numbers(self, row):
case_numbers = re.findall("[0-9]{4}[A-Z]{2}[0-9]{4}", text)
return ", ".join(case_numbers)

def _generate_back_scrape_range(self):
# This is a generator function, so this code won't run until a
# caller begins iterating, which is necessary because
# otherwise this would run during unit tests and trigger an
# unwanted network request.
last_page = self._get_last_page_number()

yield from range(1, last_page + 1)

def _get_last_page_number(self):
# The link to the last page has an onclick like:
# javascript:opinion_doPostBack('paging','','&opinionsort_field=sortdate&opinionsort_field_by=&opinionsort_field_type=&opinionsort_type=DESC&opinionpage_size=50&opinionp=395')
# where 395 is the last page number.
html = self._get_html_tree_by_url(self._base_url, {})
el = html.cssselect("a[title=last]")[0]
onclick = el.get("onclick")
return int(re.findall(r"\d+", onclick)[-1])

def _download_backwards(self, page):
self.url = self._base_url + ("&opinionp=%d" % page)
self.html = self._download()
self._process_html()
def _download_backwards(self, dates: tuple[date]) -> None:
    """Store in `self.cases` the cases found inside a date range

    The page-by-page search is delegated to
    `backscrape_over_paginated_results`, starting from page 2 of the
    paginated results

    :param dates: (start, end) pair delimiting the backscrape
    :return: None
    """
    logger.info("Backscraping for range %s %s", *dates)
    # The inner "{}" is left as the placeholder for the page number
    paginated_url = "{}&opinionp={{}}".format(self.base_url)
    range_start, range_end = dates
    self.cases = backscrape_over_paginated_results(
        url_template=paginated_url,
        first_page=2,
        # Upper bound for the page number. Real last page is 467 in Oct, 2024
        last_page=500,
        start_date=range_start,
        end_date=range_end,
        date_fmt="%m/%d/%Y",
        site=self,
    )
74 changes: 10 additions & 64 deletions juriscraper/opinions/united_states/state/nd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from juriscraper.AbstractSite import logger
from juriscraper.lib.string_utils import normalize_dashes
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand All @@ -21,7 +22,9 @@ class Site(OpinionSiteLinear):
"nature_of_suit",
"judge",
]
first_opinion_date = datetime(1955, 10, 25).date()
first_opinion_date = datetime(1955, 10, 25)
# Ensure the backscrape iterable has a single item
days_interval = (datetime.today() - first_opinion_date).days + 2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -158,69 +161,12 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
:param dates: (start_date, end_date) tuple
:return None
"""
logger.info("Backscraping for range %s %s", *dates)
start, end = dates
date_fmt = "%m/%d/%Y"
# last page is 118 (August 2024)
first_page, last_page = 2, 130
last_page = 130
base_url = self.url
cases = []

for page in range(first_page, last_page):
self.cases = [] # reset results container
self.url = f"{base_url}&page={page}"
self.html = self._download()
self._process_html()

# results are ordered by desceding date
earliest = datetime.strptime(
self.cases[-1]["date"], date_fmt
).date()
latest = datetime.strptime(self.cases[0]["date"], date_fmt).date()
logger.info(
"Results page has date range %s to %s", earliest, latest
)

# no intersection between date ranges
if max(earliest, start) >= min(latest, end):
# if earliest date from results is earlier than
# the start date, no need to iterate any further
if earliest < start:
logger.info(
"Finishing backscrape: earliest results date is %s earlier than start %s",
earliest,
start,
)
break
continue

# if there is an intersection, test every case and
# collect the matching cases
for case in self.cases:
case_date = datetime.strptime(case["date"], date_fmt).date()
if case_date < end and case_date > start:
cases.append(case)

self.cases = cases

def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Checks if backscrape start and end arguments have been passed
by caller, and parses them accordingly
:param kwargs: passed when initializing the scraper, may or
may not contain backscrape controlling arguments
:return None
"""
start = kwargs.get("backscrape_start")
end = kwargs.get("backscrape_end")

if start:
start = datetime.strptime(start, "%m/%d/%Y").date()
else:
start = self.first_opinion_date
if end:
end = datetime.strptime(end, "%m/%d/%Y").date()
else:
end = datetime.now().date()

logger.info("Backscraping for cases between %s and %s", start, end)
self.back_scrape_iterable = [(start, end)]
url_template = f"{base_url}&page={{}}"
self.cases = backscrape_over_paginated_results(
url_template, 2, last_page, start, end, "%m/%d/%Y", self
)
5 changes: 4 additions & 1 deletion juriscraper/opinions/united_states/state/neb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import fix_links_in_lxml_tree
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

Expand Down Expand Up @@ -50,8 +51,10 @@ def _process_html(self):
for row in table.xpath(".//tr[td]"):
c1, c2, c3 = row.xpath(".//td")
docket = c1.xpath(".//text()")[0].strip()
if "A-XX-XXXX" in docket:
if "A-XX-XXXX" in docket or not c3.xpath(".//a"):
logger.info("Skip row %s", row.text_content())
continue

citation = c2.xpath(".//text()")[0].strip()
name = c3.xpath(".//a/text()")[0].strip()
url = c3.xpath(".//a")[0].get("href")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from setuptools import find_packages, setup
from setuptools.command.install import install

VERSION = "2.6.30"
VERSION = "2.6.31"
AUTHOR = "Free Law Project"
EMAIL = "[email protected]"
HERE = os.path.abspath(os.path.dirname(__file__))
Expand Down

0 comments on commit 302163a

Please sign in to comment.