RFCartography/rfcartography/index_parser.py
2023-01-03 14:42:54 +01:00

515 lines
22 KiB
Python

from enum import Enum, auto
from abc import ABC, abstractmethod
from datetime import date
from xml.etree.ElementTree import Element
from defusedxml.ElementTree import fromstring
class DocType(Enum):
    """Document series known to the index, keyed by their three-letter prefix.

    Member names double as the prefix of a document identifier (see
    ``docID``), so they must stay exactly three upper-case letters.
    """

    RFC = 1
    STD = 2
    BCP = 3
    FYI = 4
    NIC = 5
    IEN = 6
    RTR = 7

    def docID(self, num: int) -> str:
        """Return the identifier of document number *num* in this series.

        RFC, STD, BCP and FYI identifiers left-pad the number with zeros to a
        minimum of four characters (``RFC0001``); NIC, IEN and RTR identifiers
        append the number as-is (``NIC7``).
        """
        zero_padded = (DocType.RFC, DocType.STD, DocType.BCP, DocType.FYI)
        if self in zero_padded:
            # Right-align the textual form of the number in a field of
            # width 4 filled with '0'; longer numbers are left untouched.
            return f"{self.name}{num!s:0>4}"
        return f"{self.name}{num}"
class Status(Enum):
    """Status values an RFC can carry (current status or publication status).

    Members are looked up by name in this module after spaces in the status
    text have been replaced with underscores, so the names must keep that
    spelling. The numeric values only serve to tell the members apart.
    """

    INTERNET_STANDARD = 1
    DRAFT_STANDARD = 2
    PROPOSED_STANDARD = 3
    UNKNOWN = 4
    BEST_CURRENT_PRACTICE = 5
    FOR_YOUR_INFORMATION = 6
    EXPERIMENTAL = 7
    HISTORIC = 8
    INFORMATIONAL = 9
class FileFormat(Enum):
    """File formats in which a document can be listed as available.

    Members are looked up by name from the text of the index's
    ``file-format`` elements, so the names must match that text verbatim.
    The numeric values only serve to tell the members apart.
    """

    ASCII = 1
    PS = 2
    PDF = 3
    TGZ = 4
    HTML = 5
    XML = 6
    TEXT = 7
class Stream(Enum):
    """Streams an RFC can be attributed to.

    Members are looked up by name from the text of the index's ``stream``
    element, which is why the names are not uniformly upper-case: they have
    to match that text verbatim, including case. The numeric values only
    serve to tell the members apart.
    """

    IETF = 1
    IAB = 2
    IRTF = 3
    INDEPENDENT = 4
    Editorial = 5
    Legacy = 6
class Month(Enum):
    """English month names mapped to their calendar month number (1-12).

    Members are looked up by name from the text of the index's ``month``
    element; ``.value`` is then used as the month when building a ``date``.
    """

    # First half of the year.
    January = 1
    February = 2
    March = 3
    April = 4
    May = 5
    June = 6

    # Second half of the year.
    July = 7
    August = 8
    September = 9
    October = 10
    November = 11
    December = 12
class Author:
    """One author of a document as listed in the index.

    Only *name* is required; the remaining fields default to the empty
    string when the index does not provide them.
    """

    def __init__(self,
                 name: str,
                 title: str = "",
                 organization: str = "",
                 org_abbrev: str = ""):
        """Store the given author details unchanged on the instance."""
        # Who the author is.
        self.name: str = name
        self.title: str = title
        # Where the author belongs: full organization name and its short form.
        self.organization: str = organization
        self.org_abbrev: str = org_abbrev
class Document(ABC):
def __init__(self,
type: DocType,
number: int,
title: str = "",
is_also: list['Document'] = []):
self.type: DocType = type
self.number: int = number
self.title: str = title
self.is_also: list['Document'] = is_also
return
def docID(self) -> str:
return self.type.docID(self.number)
@abstractmethod
def update(self, **kwargs) -> 'Document':
pass
@abstractmethod
def get_references(self) -> list[tuple[str, 'Document']]:
pass
class RFC(Document):
    """An RFC together with the metadata and relations recorded for it.

    Everything except *number* is optional, so an RFC that is so far only
    referenced by another document can be created empty and completed later
    through ``update``.
    """

    # Keyword names ``update`` accepts; each one is also the name of the
    # instance attribute it overwrites.
    _UPDATABLE_FIELDS = (
        'title', 'authors', 'pub_date', 'current_status', 'pub_status',
        'format', 'page_count', 'keywords', 'abstract', 'draft', 'notes',
        'obsoletes', 'obsoleted_by', 'updates', 'updated_by', 'is_also',
        'see_also', 'stream', 'area', 'wg_acronym', 'errata_url', 'doi',
    )

    @staticmethod
    def _list_or_new(value):
        """Return *value* itself, or a new empty list when it is ``None``."""
        return [] if value is None else value

    def __init__(self,
                 number: int,
                 title: str = "",
                 authors: list[Author] | None = None,
                 pub_date: date | None = None,
                 current_status: Status = Status.UNKNOWN,
                 pub_status: Status = Status.UNKNOWN,
                 format: list[FileFormat] | None = None,
                 page_count: int | None = None,
                 keywords: list[str] | None = None,
                 abstract: list[str] | None = None,
                 draft: str = "",
                 notes: str = "",
                 obsoletes: list[Document] | None = None,
                 obsoleted_by: list[Document] | None = None,
                 updates: list[Document] | None = None,
                 updated_by: list[Document] | None = None,
                 is_also: list[Document] | None = None,
                 see_also: list[Document] | None = None,
                 stream: Stream | None = None,
                 area: str = "",
                 wg_acronym: str = "",
                 errata_url: str = "",
                 doi: str = ""):
        """Create an RFC with the given number and whatever metadata is known.

        List arguments are stored as given (not copied). Each list argument
        that is omitted or ``None`` becomes a new empty list owned by this
        instance; with literal ``[]`` defaults all RFCs created without that
        argument would share (and jointly mutate) one list object.
        """
        own = self._list_or_new
        super().__init__(DocType.RFC, number, title, own(is_also))
        # Descriptive metadata.
        self.authors: list[Author] = own(authors)
        self.pub_date: date | None = pub_date
        self.format: list[FileFormat] = own(format)
        self.page_count: int | None = page_count
        self.keywords: list[str] = own(keywords)
        self.abstract: list[str] = own(abstract)
        self.draft: str = draft
        self.notes: str = notes
        # Relations to other documents (``is_also`` lives on the base class).
        self.obsoletes: list[Document] = own(obsoletes)
        self.obsoleted_by: list[Document] = own(obsoleted_by)
        self.updates: list[Document] = own(updates)
        self.updated_by: list[Document] = own(updated_by)
        self.see_also: list[Document] = own(see_also)
        # Status and provenance.
        self.current_status: Status = current_status
        self.pub_status: Status = pub_status
        self.stream: Stream | None = stream
        self.area: str = area
        self.wg_acronym: str = wg_acronym
        self.errata_url: str = errata_url
        self.doi: str = doi

    def update(self, **kwargs) -> Document:
        """Overwrite the attributes named in *kwargs* and return ``self``.

        Accepted names are those in ``_UPDATABLE_FIELDS`` (the constructor
        parameters other than ``number``). Any other keyword is ignored, and
        attributes that are not mentioned keep their current value.
        """
        for field in self._UPDATABLE_FIELDS:
            if field in kwargs:
                setattr(self, field, kwargs[field])
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return all related documents as ``(relation name, document)`` pairs.

        Pairs are grouped by relation in the fixed order obsoletes,
        obsoleted by, updates, updated by, is also, see also; within a group
        the order of the underlying list is kept.
        """
        relations: tuple[tuple[str, list[Document]], ...] = (
            ("obsoletes", self.obsoletes),
            ("obsoleted by", self.obsoleted_by),
            ("updates", self.updates),
            ("updated by", self.updated_by),
            ("is also", self.is_also),
            ("see also", self.see_also),
        )
        return [(label, doc) for label, docs in relations for doc in docs]
class NotIssued(Document):
    """Entry for an RFC number that the index lists as not issued.

    It is filed under ``DocType.RFC`` and carries no title, metadata or
    references of its own.
    """

    def __init__(self, number: int):
        """Register *number* as a document of the RFC series."""
        super().__init__(DocType.RFC, number)

    def update(self, **kwargs) -> Document:
        """Ignore every keyword argument and return ``self`` unchanged."""
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return an empty list; this class reports no references."""
        return []
class STD(Document):
    """Document of the STD series: a title plus the documents it is also known as."""

    def __init__(self,
                 number: int,
                 title: str = "",
                 is_also: list[Document] | None = None):
        """Create STD *number*.

        *is_also* is stored as given (not copied). When it is omitted or
        ``None`` the instance gets a new empty list of its own; a literal
        ``[]`` default would be one list shared by every STD created
        without the argument.
        """
        super().__init__(DocType.STD, number, title,
                         [] if is_also is None else is_also)

    def update(self, **kwargs) -> Document:
        """Overwrite ``title`` and/or ``is_also`` from *kwargs*; return ``self``.

        Any other keyword is ignored.
        """
        for field in ('title', 'is_also'):
            if field in kwargs:
                setattr(self, field, kwargs[field])
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return one ``("is also", document)`` pair per entry of ``is_also``."""
        return [("is also", doc) for doc in self.is_also]
class BCP(Document):
    """Document of the BCP series: a title plus the documents it is also known as."""

    def __init__(self,
                 number: int,
                 title: str = "",
                 is_also: list[Document] | None = None):
        """Create BCP *number*.

        *is_also* is stored as given (not copied). When it is omitted or
        ``None`` the instance gets a new empty list of its own; a literal
        ``[]`` default would be one list shared by every BCP created
        without the argument.
        """
        super().__init__(DocType.BCP, number, title,
                         [] if is_also is None else is_also)

    def update(self, **kwargs) -> Document:
        """Overwrite ``title`` and/or ``is_also`` from *kwargs*; return ``self``.

        Any other keyword is ignored.
        """
        for field in ('title', 'is_also'):
            if field in kwargs:
                setattr(self, field, kwargs[field])
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return one ``("is also", document)`` pair per entry of ``is_also``."""
        return [("is also", doc) for doc in self.is_also]
class FYI(Document):
    """Document of the FYI series: a title plus the documents it is also known as."""

    def __init__(self,
                 number: int,
                 title: str = "",
                 is_also: list[Document] | None = None):
        """Create FYI *number*.

        *is_also* is stored as given (not copied). When it is omitted or
        ``None`` the instance gets a new empty list of its own; a literal
        ``[]`` default would be one list shared by every FYI created
        without the argument.
        """
        super().__init__(DocType.FYI, number, title,
                         [] if is_also is None else is_also)

    def update(self, **kwargs) -> Document:
        """Overwrite ``title`` and/or ``is_also`` from *kwargs*; return ``self``.

        Any other keyword is ignored.
        """
        for field in ('title', 'is_also'):
            if field in kwargs:
                setattr(self, field, kwargs[field])
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return one ``("is also", document)`` pair per entry of ``is_also``."""
        return [("is also", doc) for doc in self.is_also]
class NIC(Document):
    """Document of the NIC series; only its number is recorded."""

    def __init__(self, number: int):
        """Register *number* as a document of the NIC series."""
        super().__init__(DocType.NIC, number)

    def update(self, **kwargs) -> Document:
        """Ignore every keyword argument and return ``self`` unchanged."""
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return an empty list; this class reports no references."""
        return []
class IEN(Document):
    """Document of the IEN series; only its number is recorded."""

    def __init__(self, number: int):
        """Register *number* as a document of the IEN series."""
        super().__init__(DocType.IEN, number)

    def update(self, **kwargs) -> Document:
        """Ignore every keyword argument and return ``self`` unchanged."""
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return an empty list; this class reports no references."""
        return []
class RTR(Document):
    """Document of the RTR series; only its number is recorded."""

    def __init__(self, number: int):
        """Register *number* as a document of the RTR series."""
        super().__init__(DocType.RTR, number)

    def update(self, **kwargs) -> Document:
        """Ignore every keyword argument and return ``self`` unchanged."""
        return self

    def get_references(self) -> list[tuple[str, Document]]:
        """Return an empty list; this class reports no references."""
        return []
class IndexParser:
    """Build an in-memory index of documents from an RFC index XML string.

    All parsing happens in the constructor. The result is ``self.index``,
    which maps every ``DocType`` to a ``{number: Document}`` dictionary.

    A document that is referenced by another entry before (or without) having
    an entry of its own is created as an empty placeholder. When its own
    entry is met later the placeholder is updated in place, so every
    reference to a document points at the same object.
    """

    # Class to instantiate for a referenced document that is not indexed yet.
    _PLACEHOLDER_CLASSES = {
        DocType.RFC: RFC,
        DocType.STD: STD,
        DocType.BCP: BCP,
        DocType.FYI: FYI,
        DocType.NIC: NIC,
        DocType.IEN: IEN,
        DocType.RTR: RTR,
    }

    # Sub-series entries that consist of a doc-id, a title and an "is-also"
    # list: local element name -> (series, class to instantiate).
    _SUBSERIES_ENTRIES = {
        "std-entry": (DocType.STD, STD),
        "bcp-entry": (DocType.BCP, BCP),
        "fyi-entry": (DocType.FYI, FYI),
    }

    def __init__(self,
                 xml: str,
                 namespace: str = "http://www.rfc-editor.org/rfc-index"):
        """Parse *xml* and populate ``self.index``.

        Args:
            xml: The complete index document as a string.
            namespace: XML namespace all index elements are expected in.

        Raises:
            KeyError: A doc-id prefix, month, file format, status or stream
                in the input has no matching enum member.
            ValueError: A number field does not contain an integer.
            AttributeError, TypeError: An entry lacks an element this parser
                requires (doc-id; for RFC entries also date and both status
                elements).
            Exception: Whatever ``fromstring`` raises for input it rejects.
        """
        # Prefix that turns a local element name into a qualified tag.
        self._ns: str = f"{{{namespace}}}"
        self.index: dict[DocType, dict[int, Document]] = {
            doc_type: {} for doc_type in DocType
        }

        rfc_tag = self._tag("rfc-entry")
        not_issued_tag = self._tag("rfc-not-issued-entry")
        subseries = {self._tag(name): target
                     for name, target in self._SUBSERIES_ENTRIES.items()}

        root: Element = fromstring(xml)
        for child in root:
            if child.tag == rfc_tag:
                self._parse_rfc_entry(child)
            elif child.tag == not_issued_tag:
                self._parse_not_issued_entry(child)
            elif child.tag in subseries:
                doc_type, doc_class = subseries[child.tag]
                self._parse_subseries_entry(child, doc_type, doc_class)
            # Any other top-level element is skipped.

    def _tag(self, local_name: str) -> str:
        """Return *local_name* qualified with the index namespace."""
        return f"{self._ns}{local_name}"

    def _entry_number(self, entry: Element) -> int:
        """Return the numeric part of the entry's doc-id (prefix is 3 chars)."""
        doc_id: str = entry.findtext(self._tag("doc-id"))
        return int(doc_id[3:])

    def _child_texts(self, entry: Element, container_name: str,
                     item_name: str) -> list:
        """Return the text of every *item_name* inside the entry's *container_name*.

        Yields an empty list when the container element is absent. An item
        without text contributes ``None``.
        """
        container: Element | None = entry.find(self._tag(container_name))
        if container is None:
            return []
        return [item.text for item in container.findall(self._tag(item_name))]

    def _references(self, container: Element | None) -> list[Document]:
        """Resolve the doc-ids inside *container* to indexed documents.

        Unknown documents are added to the index as empty placeholders of
        the class matching their series. Returns a new list on every call
        (empty when *container* is ``None``).
        """
        resolved: list[Document] = []
        if container is None:
            return resolved
        for ref in container.findall(self._tag("doc-id")):
            ref_type: DocType = DocType[ref.text[:3]]
            ref_num: int = int(ref.text[3:])
            series = self.index[ref_type]
            if ref_num not in series:
                series[ref_num] = self._PLACEHOLDER_CLASSES[ref_type](ref_num)
            resolved.append(series[ref_num])
        return resolved

    def _status(self, entry: Element, element_name: str) -> Status:
        """Map the text of a status element to its ``Status`` member."""
        text: str = entry.findtext(self._tag(element_name))
        # Enum member names use underscores where the status text has spaces.
        return Status[text.replace(" ", "_")]

    def _parse_rfc_entry(self, entry: Element) -> None:
        """Read one rfc-entry and create or complete the matching RFC."""
        tag = self._tag
        number: int = self._entry_number(entry)
        # Default to "" like the bcp/fyi entries do, so ``title`` is always
        # a string even when the element is missing.
        title: str = entry.findtext(tag("title"), "")

        authors: list[Author] = [
            Author(author.findtext(tag("name")),
                   author.findtext(tag("title"), ""),
                   author.findtext(tag("organization"), ""),
                   author.findtext(tag("org-abbrev"), ""))
            for author in entry.findall(tag("author"))
        ]

        date_elem: Element | None = entry.find(tag("date"))
        pub_date: date = date(
            int(date_elem.findtext(tag("year"))),
            Month[date_elem.findtext(tag("month"))].value,
            # A date without a day element is placed on the 1st of the month.
            int(date_elem.findtext(tag("day"), "1")),
        )

        formats: list[FileFormat] = [
            FileFormat[text]
            for text in self._child_texts(entry, "format", "file-format")
        ]

        # "-1" stands in for a missing element; any negative count means
        # "unknown" and is stored as None.
        raw_page_count: int = int(entry.findtext(tag("page-count"), "-1"))
        page_count: int | None = raw_page_count if raw_page_count >= 0 else None

        keywords: list = self._child_texts(entry, "keywords", "kw")
        abstract: list = self._child_texts(entry, "abstract", "p")
        draft: str = entry.findtext(tag("draft"), "")
        notes: str = entry.findtext(tag("notes"), "")

        # Resolving references may add placeholder documents to the index.
        obsoletes = self._references(entry.find(tag("obsoletes")))
        obsoleted_by = self._references(entry.find(tag("obsoleted-by")))
        updates = self._references(entry.find(tag("updates")))
        updated_by = self._references(entry.find(tag("updated-by")))
        is_also = self._references(entry.find(tag("is-also")))
        see_also = self._references(entry.find(tag("see-also")))

        current_status: Status = self._status(entry, "current-status")
        pub_status: Status = self._status(entry, "publication-status")

        stream_elem: Element | None = entry.find(tag("stream"))
        stream: Stream | None = (Stream[stream_elem.text]
                                 if stream_elem is not None else None)

        area: str = entry.findtext(tag("area"), "")
        # NOTE(review): this element name uses an underscore while its
        # siblings use hyphens; kept exactly as before - confirm against
        # the index schema.
        wg_acronym: str = entry.findtext(tag("wg_acronym"), "")
        errata_url: str = entry.findtext(tag("errata-url"), "")
        doi: str = entry.findtext(tag("doi"), "")

        fields = dict(title=title,
                      authors=authors,
                      pub_date=pub_date,
                      current_status=current_status,
                      pub_status=pub_status,
                      format=formats,
                      page_count=page_count,
                      keywords=keywords,
                      abstract=abstract,
                      draft=draft,
                      notes=notes,
                      obsoletes=obsoletes,
                      obsoleted_by=obsoleted_by,
                      updates=updates,
                      updated_by=updated_by,
                      is_also=is_also,
                      see_also=see_also,
                      stream=stream,
                      area=area,
                      wg_acronym=wg_acronym,
                      errata_url=errata_url,
                      doi=doi)
        rfcs = self.index[DocType.RFC]
        if number in rfcs:
            # Already present (e.g. as a placeholder created for a
            # reference): fill it in place so existing references stay valid.
            rfcs[number].update(**fields)
        else:
            rfcs[number] = RFC(number, **fields)

    def _parse_not_issued_entry(self, entry: Element) -> None:
        """Record a never-issued RFC number unless that number is already indexed."""
        number: int = self._entry_number(entry)
        rfcs = self.index[DocType.RFC]
        if number not in rfcs:
            rfcs[number] = NotIssued(number)

    def _parse_subseries_entry(self, entry: Element, doc_type: DocType,
                               doc_class) -> None:
        """Read one std/bcp/fyi entry and create or complete its document."""
        number: int = self._entry_number(entry)
        title: str = entry.findtext(self._tag("title"), "")
        is_also: list[Document] = self._references(
            entry.find(self._tag("is-also")))
        series = self.index[doc_type]
        if number in series:
            series[number].update(title=title, is_also=is_also)
        else:
            series[number] = doc_class(number, title, is_also)

    def get_index(self) -> dict[DocType, dict[int, Document]]:
        """Return the parsed index (the live dictionary, not a copy)."""
        return self.index