Source code for xml_parsing

"""
Core XML Parsing of RSS documents. This is generally focused on RSS 2.0.
"""
import requests
from xml.etree import ElementTree
from pprint import pprint
import csv
from pathlib import Path
from urllib.parse import urlparse
import datetime
from typing import NamedTuple, List

class SourceRSS(NamedTuple):
    """Extract of raw RSS data"""
    title: str  #: The title
    link: str  #: The link
    description: str  #: The description
    pubDate: str  #: The publication date

class ExpandedRSS(NamedTuple):
    """
    Data expanded by the :func:`title_transform()` function.

    Note that the names of the fields in this class will be the column
    titles on saved CSV files. Any change here will be reflected in the
    files created.
    """
    title: str  #: The title
    link: str  #: The link
    description: str  #: The description
    pubDate: str  #: The publication date
    docket: str  #: The parsed docket from the title
    parties_title: str  #: The parsed parties from the title

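# A minimal sketch of building a SourceRSS by hand (field values are taken
# from the title_transform() doctest below, not from a live feed):
#
#     doc = SourceRSS(
#         title='1:15-cv-00791 SAVAGE v. BURWELL et al',
#         link='https://ecf.dcd.uscourts.gov/cgi-bin/DktRpt.pl?172013',
#         description='[Reply to opposition to motion]',
#         pubDate='Thu, 05 Jul 2018 06:26:07 GMT',
#     )
#     doc.title  # fields are accessed by name, like any NamedTuple
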
def xml_reader(url: str) -> List[SourceRSS]:
    """
    Extract RSS data given a URL to read.

    The root document is the ``<rss>`` tag, which has a single ``<channel>``
    tag. The ``<channel>`` has some overall attributes, but contains a
    sequence of ``<item>`` tags.

    This will gather "title", "link", "description", and "pubDate" from each
    item and build a :class:`SourceRSS` object.

    It might be helpful to return the overall channel properties along with
    the list of items.

    :param url: URL to read.
    :return: All of the SourceRSS items from the channel of the feed,
        List[SourceRSS].
    """
    items = []
    response = requests.get(url)
    rss = ElementTree.fromstring(response.content)
    channel = rss.find('channel')
    # Dump the overall channel properties
    print("title", channel.findtext('title'))
    print("link", channel.findtext('link'))
    print("description", channel.findtext('description'))
    print("last build date", channel.findtext('lastBuildDate'))
    for item in channel.iter('item'):
        item_row = SourceRSS(
            title=item.findtext('title'),
            link=item.findtext('link'),
            description=item.findtext('description'),
            pubDate=item.findtext('pubDate'),
        )
        items.append(item_row)
    return items

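# A hedged usage sketch for xml_reader. The URL is one of the feeds used in
# demo() below; a live network connection is assumed, and the channel
# properties are printed as a side effect:
#
#     items = xml_reader("https://ecf.dcd.uscourts.gov/cgi-bin/rss_outside.pl")
#     for item in items[:3]:
#         print(item.pubDate, item.title)
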
def title_transform(items: List[SourceRSS]) -> List[ExpandedRSS]:
    """
    A "transformation": this will parse titles for court docket RSS feeds.

    >>> from xml_parsing import title_transform, SourceRSS, ExpandedRSS

    The data is a list with a single document, [SourceRSS()].

    >>> data = [
    ...     SourceRSS(
    ...         title='1:15-cv-00791 SAVAGE v. BURWELL et al',
    ...         link='https://ecf.dcd.uscourts.gov/cgi-bin/DktRpt.pl?172013',
    ...         description='[Reply to opposition to motion] (<a href="https://ecf.dcd.uscourts.gov/doc1/04516660233?caseid=172013&de_seq_num=555" >137</a>)',
    ...         pubDate='Thu, 05 Jul 2018 06:26:07 GMT'
    ...     ),
    ... ]
    >>> title_transform(data)
    [ExpandedRSS(title='1:15-cv-00791 SAVAGE v. BURWELL et al', link='https://ecf.dcd.uscourts.gov/cgi-bin/DktRpt.pl?172013', description='[Reply to opposition to motion] (<a href="https://ecf.dcd.uscourts.gov/doc1/04516660233?caseid=172013&de_seq_num=555" >137</a>)', pubDate='Thu, 05 Jul 2018 06:26:07 GMT', docket='15-cv-00791', parties_title='SAVAGE v. BURWELL et al')]

    :param items: A list of :class:`SourceRSS` items built by :func:`xml_reader`
    :return: A new list of :class:`ExpandedRSS`, with some additional
        attributes for each item.
    """
    new_items = []
    for row in items:
        docket, _, parties_title = row.title.partition(' ')
        _, _, real_docket = docket.partition(":")
        result = ExpandedRSS(
            title=row.title,
            link=row.link,
            description=row.description,
            pubDate=row.pubDate,
            docket=real_docket,
            parties_title=parties_title,
        )
        new_items.append(result)
    return new_items

def csv_dump(data: List[ExpandedRSS], output_path: Path) -> None:
    """
    Save expanded data to a file, given the Path.

    Note that the headers are the field names from the ExpandedRSS class
    definition. This assures us that all fields will be written properly.

    :param data: List of :class:`ExpandedRSS` items, built by
        :func:`title_transform`.
    :param output_path: Path to which to write the file.
    """
    with output_path.open('w', newline='') as output_file:
        headings = list(ExpandedRSS._fields)
        writer = csv.DictWriter(output_file, headings)
        writer.writeheader()
        for row in data:
            writer.writerow(row._asdict())

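# A sketch of the extract-transform-dump pipeline through csv_dump (the
# url variable and "daily.csv" file name are illustrative):
#
#     expanded = title_transform(xml_reader(url))
#     csv_dump(expanded, Path.cwd() / "daily.csv")
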
def csv_load(input_path: Path) -> List[ExpandedRSS]:
    """
    Recover expanded data from a file, given a Path.

    Note that the headers **must be** the field names from the ExpandedRSS
    class definition. If there isn't an exact match, this won't read
    properly.

    :param input_path: Path from which to read the file.
    :returns: List of ExpandedRSS objects, used to compare a previous day's
        feed with today's feed.
    """
    data = []
    with input_path.open() as input_file:
        reader = csv.DictReader(input_file)
        for row in reader:
            expanded_rss = ExpandedRSS(**row)
            data.append(expanded_rss)
    return data

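# csv_load is the inverse of csv_dump; a sketch, assuming "daily.csv" was
# written by the csv_dump example above:
#
#     recovered = csv_load(Path.cwd() / "daily.csv")
#     assert set(recovered) == set(expanded)
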
def path_maker(url: str, now: datetime.datetime = None, format: str = "%Y%m%d") -> Path:
    """
    Builds a Path from today's date and the base name of the URL.

    The default format is "%Y%m%d" to transform the date to a YYYYmmdd
    string. An alternative can be "%Y%W%w" to create a YYYYWWw string, where
    WW is the week of the year and w is the day of the week.

    >>> from xml_parsing import path_maker
    >>> import datetime
    >>> now = datetime.datetime(2018, 9, 10)
    >>> str(path_maker("https://ecf.dcd.uscourts.gov/cgi-bin/rss_outside.pl", now))
    '20180910/rss_outside'

    :param url: An RSS-feed URL.
    :param now: Optional date/time object. Defaults to datetime.datetime.now().
    :return: A Path with the date string / base name from the URL.
    """
    url_details = urlparse(url)
    base_name = Path(url_details.path).stem
    if not now:
        now = datetime.datetime.now()
    today_str = now.strftime(format)
    return Path(today_str) / base_name

def find_yesterday(directory: Path, url: str, date_pattern: str = '[0-9]*') -> Path:
    """
    We need to search for the most recent previous entry. While we can hope
    for dependably running this every day, that's a difficult thing to
    guarantee. It's much more reliable to look for the most recent date
    which contains files for a given channel. This means scanning the
    date-named directories for ones that contain the channel's base name.

    Example. Here are two dates. One date has one channel, the other has
    two channels. ::

        20180630/one_channel/daily.csv
        20180630/one_channel/new.csv
        20180630/one_channel/save.csv
        20180701/one_channel/daily.csv
        20180701/one_channel/new.csv
        20180701/one_channel/save.csv
        20180701/another_channel/daily.csv
        20180701/another_channel/new.csv
        20180701/another_channel/save.csv

    If there's nothing available, returns None.

    :param directory: The base directory to search.
    :param url: The full URL from which we can get the base name.
    :param date_pattern: Most of the time, the interesting filenames will
        begin with a digit. If the file name pattern is changed, however,
        this can be used to match dates, and exclude non-date files that
        might be confusing.
    :return: A Path with the date string / base name from the URL, or None.
    """
    url_details = urlparse(url)
    base_name = Path(url_details.path).stem
    candidates = list(directory.glob(f"{date_pattern}/{base_name}"))
    if candidates:
        return max(candidates)

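# A sketch of find_yesterday against the layout shown in its docstring
# (the base directory is illustrative):
#
#     previous = find_yesterday(
#         Path.cwd(), "https://ecf.dcd.uscourts.gov/cgi-bin/rss_outside.pl")
#     if previous:
#         saved = csv_load(previous / "save.csv")
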
def channel_processing(url: str, directory: Path = None, date: datetime.datetime = None):
    """
    The daily process for a given channel. Ideally there's a "yesterday"
    directory. Pragmatically, a run can be missed, so :func:`find_yesterday`
    locates the most recent previous directory for this channel.

    :param url: The URL for the channel.
    :param directory: The working directory; the default is the current
        working directory.
    :param date: The date to assign to the files; by default, it's
        datetime.datetime.now().
    """
    if directory is None:
        directory = Path.cwd()
    if date is None:
        date = datetime.datetime.now()
    yesterdays_path = find_yesterday(directory, url)
    # Anchor today's path in the working directory, using the given date.
    todays_path = directory / path_maker(url, date)
    # Make sure today's directory exists before the three CSV files are written.
    todays_path.mkdir(parents=True, exist_ok=True)
    # Expand today's raw feed so it compares cleanly with the saved ExpandedRSS rows.
    todays_data = title_transform(xml_reader(url))
    if yesterdays_path:
        saved_data = csv_load(yesterdays_path / "save.csv")
    else:
        saved_data = []
    new_data = set(todays_data) - set(saved_data)
    all_data = set(todays_data) | set(saved_data)
    csv_dump(todays_data, todays_path / "daily.csv")
    csv_dump(new_data, todays_path / "new.csv")
    csv_dump(all_data, todays_path / "save.csv")


def demo():
    """
    This downloads, enriches, and saves the daily files to the current
    working directory.
    """
    target_path = Path.cwd()

    data1 = xml_reader("https://ecf.dcd.uscourts.gov/cgi-bin/rss_outside.pl")
    data1_decomposed = title_transform(data1)
    # pprint(data1_decomposed)
    csv_dump(data1_decomposed, target_path / "file1.csv")

    data2 = xml_reader("https://ecf.nyed.uscourts.gov/cgi-bin/readyDockets.pl")
    data2_decomposed = title_transform(data2)
    # pprint(data2_decomposed)
    csv_dump(data2_decomposed, target_path / "file2.csv")

    recovered = csv_load(target_path / "file2.csv")
    assert set(recovered) == set(data2_decomposed), "Weird, recovering the file was a problem."


if __name__ == "__main__":
    demo()
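
# A sketch of the daily driver: one call of channel_processing per channel,
# anchored at an explicit working directory (the directory name here is
# illustrative, not part of the module):
#
#     channel_processing(
#         "https://ecf.dcd.uscourts.gov/cgi-bin/rss_outside.pl",
#         directory=Path("rss_feeds"),
#     )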