Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## 0.7

- Add parsing to Gregorian date converter; supports month names (full or abbreviated)
in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya
- Include Gregorian dates in omnibus parser

## 0.6

- Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri)
Expand Down
17 changes: 16 additions & 1 deletion DEVELOPER_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,19 @@ pip install -e ".[docs]"
sphinx-build docs docs/_build
```

HTML documentation will be generated in `docs/_build/html`
HTML documentation will be generated in `docs/_build/html`


### Regenerating multilingual Gregorian month name parse file

The Gregorian Lark parser includes a script-generated file, which
populates month names based on a list of language codes using the Babel
library. To regenerate, run the script with hatch (which should
be installed globally):

```sh
hatch run codegen:generate
```

When the `.lark` file is modified by the script, it must be committed to git.



6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ path = "src/undate/__init__.py"
[tool.hatch.build.targets.sdist]
include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"]

[tool.hatch.envs.codegen]
dependencies = ["babel"]

[tool.hatch.envs.codegen.scripts]
generate = "python scripts/generate_gregorian_grammar.py"

[tool.pytest.ini_options]
pythonpath = "src/"
markers = [
Expand Down
75 changes: 75 additions & 0 deletions scripts/generate_gregorian_grammar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python
"""
This script generates the gregorian_multilang.lark file
with month names (full and abbreviated) based on the list of
target languages.

Run this script with hatch to regenerate the file::

hatch run codegen:generate

"""
Comment thread
coderabbitai[bot] marked this conversation as resolved.

from collections import defaultdict
import pathlib

from babel.dates import get_month_names

# lark grammar path relative to this script: two directory levels up from
# this file, then src/undate/converters/grammars
GRAMMAR_DIR_PATH = (
    pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars"
)
# file that is generated by this script, in that directory;
# it is overwritten every time the script runs
MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark"

# include month names in the following languages;
# each code is passed to Babel as the locale when looking up month names
languages = [
    "en",  # English
    "es",  # Spanish
    "fr",  # French
    "de",  # German
    "rw",  # Kinyarwanda
    "lg",  # Ganda
    "ti",  # Tigrinya
]

# warning to include at top of generated file
# (written verbatim before the month rules; lines start with // so they
# are comments in the generated grammar)
warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT.
// To regenerate: hatch run codegen:generate

"""


def main():
    """Regenerate the multilingual Gregorian month-name grammar file.

    Looks up the wide and abbreviated month names for every locale in
    ``languages`` using Babel, collects the unique names for each month
    number, and writes one ``month_N`` rule per month to
    ``MONTH_GRAMMAR_FILE``, preceded by ``warning_text``. Prints a short
    confirmation message when done.
    """
    # create a dictionary of lists to hold the names for each month,
    # keyed on month number; list order is first-seen order
    all_month_names = defaultdict(list)

    for lang in languages:
        for width in ["wide", "abbreviated"]:
            for month_num, month_name in get_month_names(width, locale=lang).items():
                # some locales use a . on the shortened month; let's ignore that
                month_name = month_name.strip(".")
                # In some cases different languages have the same abbreviations;
                # in some cases, abbreviated and full are the same.
                # Only add if not already present, to avoid redundancy
                if month_name not in all_month_names[month_num]:
                    all_month_names[month_num].append(month_name)

    # Month names include non-ASCII text (e.g. Tigrinya, French, German),
    # so write UTF-8 explicitly instead of relying on the platform's
    # default encoding.
    with MONTH_GRAMMAR_FILE.open("w", encoding="utf-8") as outfile:
        outfile.write(warning_text)

        # for each numeric month, generate a rule with all variant names:
        # month_1: "January" | "Jan" ...
        for i, names in all_month_names.items():
            # combine all names in an OR string
            or_names = " | ".join(f'"{m}"' for m in names)
            outfile.write(f"month_{i}: {or_names}\n")

    # Report the path relative to the working directory when possible;
    # fall back to the path as-is when the script is run from a directory
    # that does not contain the grammar file, so the script does not
    # fail after the file has already been written.
    try:
        display_path = MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())
    except ValueError:
        display_path = MONTH_GRAMMAR_FILE

    print(f"Successfully regenerated {display_path}")
    print("If the file has changed, make sure to commit the new version.")


if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions src/undate/converters/calendars/gregorian/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from undate.converters.calendars.gregorian.converter import GregorianDateConverter

__all__ = ["GregorianDateConverter"]
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from calendar import monthrange, isleap

from lark.exceptions import UnexpectedCharacters

from undate.undate import Undate
from undate.converters.base import BaseCalendarConverter
from undate.converters.calendars.gregorian.parser import gregorian_parser
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer


class GregorianDateConverter(BaseCalendarConverter):
Expand All @@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter):
#: arbitrary known leap year
LEAP_YEAR: int = 2024

def __init__(self):
    """Set up the converter with the transformer used when parsing."""
    # transformer instance that parse() applies to the parse tree
    # NOTE(review): the base class initializer is not called here —
    # confirm BaseCalendarConverter needs no setup of its own
    self.transformer = GregorianDateTransformer()

def min_month(self) -> int:
    """First month for the Gregorian calendar.

    :returns: always ``1``
    """
    return 1
Expand Down Expand Up @@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
a common point of comparison.
"""
return (year, month, day)

def parse(self, value: str) -> Undate:
    """
    Parse a Gregorian date string of any supported precision in any
    supported language and return an :class:`~undate.undate.Undate`.
    The input date string is preserved in the label of the resulting
    Undate object.

    :param value: date string to parse
    :raises ValueError: if the string is empty or cannot be parsed
        as a Gregorian date
    """
    if not value:
        raise ValueError("Parsing empty string is not supported")

    # UnexpectedInput is the common base class for Lark's parse errors
    # (unexpected characters, unexpected tokens, unexpected end of input);
    # catching it means every parse failure is reported as a ValueError
    from lark.exceptions import UnexpectedInput

    try:
        # parse the string with our Gregorian date parser
        parsetree = gregorian_parser.parse(value)
    except UnexpectedInput as err:
        raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err

    # transform the parse tree into an undate object
    undate_obj = self.transformer.transform(parsetree)
    # set the original date string as the label
    undate_obj.label = value
    return undate_obj
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
10 changes: 10 additions & 0 deletions src/undate/converters/calendars/gregorian/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from lark import Lark

from undate.converters import GRAMMAR_FILE_PATH

# location of the Gregorian grammar file within the shared grammar directory
grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark"

# open based on filename to allow relative imports based on grammar file
# (gregorian.lark imports its month rules from a sibling grammar file);
# parsing starts at the grammar's gregorian_date rule
gregorian_parser = Lark.open(
    str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True
)
42 changes: 42 additions & 0 deletions src/undate/converters/calendars/gregorian/transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from lark import Transformer, Tree

from undate import Undate, Calendar


class GregorianDateTransformer(Transformer):
    """Convert a parsed Gregorian date tree into an Undate."""

    # The grammar is not currently expected to produce intervals.

    calendar = Calendar.GREGORIAN

    def gregorian_date(self, items):
        """Build an Undate from the year / month / day subtrees."""
        date_fields = ("year", "month", "day")
        # each matching subtree carries exactly one value, which is
        # cast to an integer and keyed on the subtree name
        parts = {
            str(node.data): int(node.children[0])
            for node in items
            if node.data in date_fields
        }
        # every result is tagged with the Gregorian calendar
        return Undate(**parts, calendar=self.calendar)

    def year(self, items):
        """Join the year tokens into one string, wrapped in a ``year`` tree."""
        digits = "".join(map(str, items))
        return Tree("year", [digits])

    def month(self, items):
        """Turn the matched month rule into a ``month`` tree holding its number."""
        # The first child is a subtree named for the rule that matched
        # (month_1, month_2, ...); the text after the last underscore
        # is the month number.
        rule_name = items[0].data
        month_n = rule_name.rpartition("_")[2]
        return Tree("month", [month_n])

    def day(self, items):
        """Join the day tokens into one string, wrapped in a ``day`` tree."""
        digits = "".join(map(str, items))
        return Tree("day", [digits])
4 changes: 3 additions & 1 deletion src/undate/converters/combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from undate import Undate, UndateInterval
from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH
from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.islamic.transformer import IslamicDateTransformer

Expand All @@ -33,6 +34,7 @@ def start(self, children):
edtf=EDTFTransformer(),
hebrew=HebrewDateTransformer(),
islamic=IslamicDateTransformer(),
gregorian=GregorianDateTransformer(),
)


Expand All @@ -45,7 +47,7 @@ def start(self, children):
class OmnibusDateConverter(BaseDateConverter):
"""
Combination parser that aggregates existing parser grammars.
Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous.
Currently supports EDTF, Gregorian, Hebrew, and Hijri where dates are unambiguous.
(Year-only dates are parsed as EDTF in Gregorian calendar.)

Does not support serialization.
Expand Down
18 changes: 15 additions & 3 deletions src/undate/converters/grammars/combined.lark
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
%import common.WS
%ignore WS

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )
// Some abbreviations use periods; some default date formats
// include commas. Ignore both. (Copied from gregorian.lark)
PUNCTUATION: "." | ","
%ignore PUNCTUATION
Comment thread
rlskoeser marked this conversation as resolved.
Outdated

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date )

// Renaming of the import variables is required, as they receive the namespace of this file.
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565
Expand All @@ -23,10 +28,17 @@ start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )
%import .islamic.month -> islamic__month
%import .islamic.year -> islamic__year

// gregorian calendar, in multiple languages
%import .gregorian.gregorian_date -> gregorian__gregorian_date


// override hebrew date to omit year-only, since year without calendar is ambiguous
// NOTE: potentially support year with calendar label
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year

// same for islamic date, year alone is ambiguous
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year

// same as above. omit year only, since covered by EDTF
// %override gregorian__gregorian_date: day month year | month day year | year month day | month year | year month | day month | month day

38 changes: 38 additions & 0 deletions src/undate/converters/grammars/gregorian.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
%import common.WS
%ignore WS

// Some abbreviations use periods; some default date formats
// include commas. Ignore both
PUNCTUATION: "." | ","
%ignore PUNCTUATION

%import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \
month_6, month_7, month_8, month_9, month_10, month_11, month_12)


// no weekday support for now
gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day

// months have 28 to 31 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/

// Gregorian calendar started in 1582; assume years with 3 or more digits for now,
// so we can support mixed day / year order unambiguously
year: /\b\d{3,}\b/
// Use word boundaries to separate from other tokens (esp. numeric day),
// since we otherwise ignore whitespace

// months
month: month_1
| month_2
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12

15 changes: 15 additions & 0 deletions src/undate/converters/grammars/gregorian_multilang.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// WARNING: This file is auto-generated. DO NOT EDIT.
// To regenerate: hatch run codegen:generate

month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Mutarama" | "mut" | "Janwaliyo" | "ጥሪ"
month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Gashyantare" | "gas" | "Febwaliyo" | "ለካቲት" | "ለካ"
month_3: "March" | "Mar" | "marzo" | "mar" | "mars" | "März" | "Werurwe" | "wer" | "Marisi" | "መጋቢት" | "መጋ"
month_4: "April" | "Apr" | "abril" | "abr" | "avril" | "avr" | "Mata" | "mat" | "Apuli" | "Apu" | "ሚያዝያ" | "ሚያ"
month_5: "May" | "mayo" | "may" | "mai" | "Mai" | "Gicurasi" | "gic" | "Maayi" | "Maa" | "ጉንበት" | "ግን"
month_6: "June" | "Jun" | "junio" | "jun" | "juin" | "Juni" | "Kamena" | "kam" | "Juuni" | "Juu" | "ሰነ"
month_7: "July" | "Jul" | "julio" | "jul" | "juillet" | "juil" | "Juli" | "Nyakanga" | "nya" | "Julaayi" | "ሓምለ" | "ሓም"
month_8: "August" | "Aug" | "agosto" | "ago" | "août" | "Kanama" | "kan" | "Agusito" | "Agu" | "ነሓሰ" | "ነሓ"
month_9: "September" | "Sep" | "septiembre" | "sept" | "septembre" | "Sept" | "Nzeri" | "nze" | "Sebuttemba" | "Seb" | "መስከረም" | "መስ"
month_10: "October" | "Oct" | "octubre" | "oct" | "octobre" | "Oktober" | "Okt" | "Ukwakira" | "ukw" | "Okitobba" | "Oki" | "ጥቅምቲ" | "ጥቅ"
month_11: "November" | "Nov" | "noviembre" | "nov" | "novembre" | "Ugushyingo" | "ugu" | "Novemba" | "ሕዳር" | "ሕዳ"
month_12: "December" | "Dec" | "diciembre" | "dic" | "décembre" | "déc" | "Dezember" | "Dez" | "Ukuboza" | "uku" | "Desemba" | "Des" | "ታሕሳስ" | "ታሕ"
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from undate.date import DatePrecision
from undate.undate import Undate, Calendar
from undate.converters.calendars import GregorianDateConverter


Expand Down Expand Up @@ -38,3 +40,28 @@ def test_representative_years(self):
converter.LEAP_YEAR,
converter.NON_LEAP_YEAR,
]

def test_parse(self):
    """Parsing yields the expected date, calendar, precision and label."""
    # (input string, expected undate, expected precision)
    cases = [
        # day; Ugushyingo = November
        ("2022 Ugushyingo 26", Undate(2022, 11, 26), DatePrecision.DAY),
        # month
        ("avril 1362", Undate(1362, 4), DatePrecision.MONTH),
        # year
        ("932", Undate(932), DatePrecision.YEAR),
    ]
    for date_str, expected, precision in cases:
        parsed = GregorianDateConverter().parse(date_str)
        assert parsed == expected
        assert parsed.calendar == Calendar.GREGORIAN
        assert parsed.precision == precision
        # the original input string is kept as the label
        assert parsed.label == date_str
Loading