"""
ConLang Grammar and Lexicon.
See https://www.nltk.org/book/ch05.html for the Parts-of-Speech Tags
Also, see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
for a more complete set.
See https://www.cis.upenn.edu/~bies/manuals/tagguide.pdf
Also https://universaldependencies.org/u/pos/all.html
The tricky part is -- of course -- the grammar.
Our goal is to move from a tagged English parse tree to the ConLang tree through a number of transformations.
Input
::
(S (VP (TV kill) (NP (DET the) (N mage))))
In English, this Transitive Verb example doesn't have a subject only an object.
English permits ``S -> VP NP`` as well as more the more conventional ``S -> NP VP`` structures.
See https://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm where VVI/VVZ is used for imperative.
The target ConLang in this implementation is ``VP NP NP``, ``VP NP PP``, ``VP NP PP NP PP`` kinds of things. Note the recursive
NP -> Det Nominal and Nominal -> Nominal Noun to permit constructing complicated relationships.
"go-Kill you the mage" sorts of constructs.
Present is the base form.
Imperative is a "go-" prefix.
Present participle ("-ing" in English) is a "now-" prefix.
Past ("-ed" in English) is a "did-" prefix.
No person conjugation. It's a separate word after the verb.
.. csv-table:: Verb Subcategories
Symbol, Meaning, Example
IV, intransitive verb, barked ``(VP -> IV Adj)``
TV, transitive verb, saw a man ``(VP -> TV NP)``
DatV, dative verb, gave a dog to a man ``(VP -> NP PP)``
SV, sentential verb, said that a dog barked ``(V -> SV S)``
Generally, this example ConLang lacks Intransitive verbs. "did-Bark the dog at something."
"did-See myself the man." "did-Give the dog to a man." "did-Say the man did-Bark the dog at something."
Rules needed
============
- Pivot Declarative to VP first. ``(S (NP $n) (VP $v) $x) -> (S (VP $v) (NP $n) $x)``.
Imperative ``(S (VP))`` left alone.
Interrogative ``(S (Aux NP VP))``, ``(S (Wh-NP VP))``, ``(S (Wh-NP Aux NP VP))`` should also be swapped around to verb first.
- Adverbs are split out with a helper. ``(VP $x (ADVP (NP $y))) -> (VP $x (PP with (NP $y)))``.
- The Verb Subcategories rules, above.
- Apply Tense Prefix to base Verb's stem word. ``(VP (MD $m) (VP $v)) -> (VP ${f|prefix(m)}))`` rewrite verb root with mode prefix.
- Apply noun determiner suffix to nounds ``(NP (DET $d) (N $n)) -> (NP ${n|suffix(d)})``.
"""
import argparse
import bisect
from collections import defaultdict, deque
import csv
from enum import Enum
import hashlib
import io
import itertools
from pprint import pprint, pformat
import random
import re
import sys
from textwrap import dedent
from typing import NamedTuple, TypeVar, List, Tuple, Dict, Callable, Union
### Weighted Choices.
CT = TypeVar("CT")
[docs]
def weighted_choice(source: List[Tuple[CT, int]]) -> CT:
"""Given [(string, int), ...] weighted strings, pick a string."""
choices, weights = zip(*source)
cumulative_dist = list(itertools.accumulate(weights))
selection = random.random()*cumulative_dist[-1]
return choices[bisect.bisect(cumulative_dist, selection)]
def test_weighted_choice_1() -> None:
source = [('a', 1), ('b', 1), ('c', 1), ('d', 1)]
random.seed(42)
examples = [weighted_choice(source) for _ in range(1000)]
assert examples[:10] == ['c', 'a', 'b', 'a', 'c', 'c', 'd', 'a', 'b', 'a']
from collections import Counter
distribution = Counter(examples)
assert distribution.most_common() == [('c', 263), ('d', 257), ('b', 242), ('a', 238)]
def test_weighted_choice_2() -> None:
source = [('a', 1), ('b', 2), ('c', 4), ('d', 8)]
random.seed(42)
examples = [weighted_choice(source) for _ in range(1000)]
assert examples[:10] == ['d', 'a', 'c', 'c', 'd', 'd', 'd', 'b', 'c', 'a']
from collections import Counter
distribution = Counter(examples)
assert distribution.most_common() == [('d', 551), ('c', 271), ('b', 126), ('a', 52)]
### Lexicon.
# For this application, weighted choices are strings.
WeightedChoices = List[Tuple[str, int]]
[docs]
class WordMaker:
"""Create words from a few rules.
1. Markov chains based on digraphs.
2. Initial Letter thrown on kind of randomly.
3. Seeded RNG from source word.
"""
# https://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/digraphs.html
digraph_text = dedent('''\
Digraph Count Digraph Frequency
th 5532 th 1.52
he 4657 he 1.28
in 3429 in 0.94
er 3420 er 0.94
an 3005 an 0.82
re 2465 re 0.68
nd 2281 nd 0.63
at 2155 at 0.59
on 2086 on 0.57
nt 2058 nt 0.56
ha 2040 ha 0.56
es 2033 es 0.56
st 2009 st 0.55
en 2005 en 0.55
ed 1942 ed 0.53
to 1904 to 0.52
it 1822 it 0.50
ou 1820 ou 0.50
ea 1720 ea 0.47
hi 1690 hi 0.46
is 1660 is 0.46
or 1556 or 0.43
ti 1231 ti 0.34
as 1211 as 0.33
te 985 te 0.27
et 704 et 0.19
ng 668 ng 0.18
of 569 of 0.16
al 341 al 0.09
de 332 de 0.09
se 300 se 0.08
le 298 le 0.08
sa 215 sa 0.06
si 186 si 0.05
ar 157 ar 0.04
ve 148 ve 0.04
ra 137 ra 0.04
ld 64 ld 0.02
ur 60 ur 0.02
''')
# https://en.wikipedia.org/wiki/Letter_frequency
first_letter_text = dedent('''\
Letter Frequency
z 0.034%
y 1.620%
x 0.017%
w 6.753%
v 0.649%
u 1.487%
t 16.671%
s 7.755%
r 1.653%
q 0.173%
p 2.545%
o 6.264%
n 2.365%
m 4.383%
l 2.705%
k 0.590%
j 0.597%
i 6.286%
h 7.232%
g 1.950%
f 3.779%
e 2.007%
d 2.670%
c 3.511%
b 4.702%
a 11.602%
''')
# Real data: http://norvig.com/mayzner.html
# Handy approximation
# word lengths are 2 to 12, linear 200 down to 20 occurrences.
# (x, y) = (2, 200)
# (x, y) = (12, 20)
# m = Fraction(200-20)/Fraction(2-12) = -18
# b = 200-m*2 = 236
lengths = [(x, 236-18*x) for x in range(2,13)]
[docs]
def __init__(self, make_seed: Callable[[str], int]) -> None:
"""
Prepare the generator using a seed-creating function
and loading the frequency tables.
:param make_seed: a function to transform English word to a seed for generating a ConLang word.
"""
self.load_digraph_markov()
self.load_first_letters()
self.make_seed = make_seed
[docs]
def load_digraph_markov(self) -> None:
source_file = io.StringIO(self.digraph_text)
reader = csv.DictReader(source_file, delimiter='\t', skipinitialspace=True)
self.digraphs: WeightedChoices = list(
(row['Digraph'], int(row['Count']))
for row in reader
)
# The digraphs -- alone -- aren't very English like.
# Turning them into Markov Chains is more useful.
self.markov: Dict[str, WeightedChoices] = defaultdict(list)
for digraph, count in self.digraphs:
start, nextc = digraph
self.markov[start].append((nextc, count))
[docs]
def load_first_letters(self) -> None:
source_file = io.StringIO(self.first_letter_text)
reader = csv.DictReader(source_file, delimiter='\t', skipinitialspace=True)
self.first_letters = list((row['Letter'], 1000*float(row['Frequency'][:-1])) for row in reader)
[docs]
def word(self, seed: str = "") -> str:
"""Expand weighted Markov chains through the digraphs."""
if seed:
random.seed(self.make_seed(seed))
target_length = weighted_choice(self.lengths)
c_1, c_2 = weighted_choice(self.digraphs)
if target_length == 2:
w = [c_1, c_2]
else:
c_0 = weighted_choice(self.first_letters)
w = [c_0, c_1, c_2]
while len(w) < target_length:
if w[-1] in self.markov:
w.append(weighted_choice(self.markov[w[-1]]))
else:
target_length += 2
c_1, c_2 = weighted_choice(self.digraphs)
w.extend(["'", c_1, c_2])
return ''.join(w)
[docs]
def naive_seed(original: str) -> int:
"""Transform source word into RNG seed. Seems to have too many collisions."""
return sum(original.encode('ascii'))
[docs]
def hash_seed(original: str) -> int:
"""Transform source word into RNG seed."""
return sum(hashlib.sha1(original.encode('ascii')).digest())
def test_wordmaker() -> None:
m = WordMaker(lambda n: n)
words = [m.word(n) for n in range(1, 100, 10)]
assert words == ['is', 'aesth', 'to', 'he', 'tere', 'tnt', 'cesth', 'yeng', 'jonde', 'in']
random.seed(42)
lexicon = [m.word() for _ in range(5000)]
from collections import Counter
first = Counter(w[0] for w in lexicon)
pct = sorted(((letter, 100*count/5000) for letter, count in first.most_common()), reverse=True)
assert pct[:5] == [('z', 0.02), ('y', 1.2), ('x', 0.02), ('w', 4.62), ('v', 0.74)]
assert pct[-5:] == [('e', 4.7), ('d', 2.4), ('c', 3.38), ('b', 4.22), ('a', 11.22)]
### Grammar.
# We don't parse English into the tagged structure.
# The material to be translated needs to be marked up manually.
# (Use NLTK to create tagged structures https://www.nltk.org/api/nltk.parse.stanford.html)
[docs]
class Tag(NamedTuple):
"""Tagged text that forms a tree. (Similar to NLTK.Tree.)"""
pos: str # Part of Speech. "S", "NP", "VP", "ADVP", "MD", "TV", "IV", "DatV", "SV", "PP", "IN"
words: list['str | Tag'] # Terminal or Tag
[docs]
@staticmethod
def from_text(source: str) -> "Tag":
"""Parse (tag (tag word) (tag word)) kinds of structures."""
lex_pat = re.compile(r"\(|\)|\s+|[^\s()]+")
tokens = (t for t in lex_pat.findall(source) if not t.isspace())
symbols = deque(tokens)
return Tag.from_symbols(symbols)
[docs]
@staticmethod
def from_symbols(symbols: List[str]) -> "Tag":
assert symbols.popleft() == "(", f'Invalid structure {symbols}'
tag = symbols.popleft()
words = []
while symbols[0] != ")":
if symbols[0] == "(":
words.append(Tag.from_symbols(symbols))
else:
words.append(symbols.popleft())
assert symbols.popleft() == ")", f'Invalid structure {symbols}'
return Tag(tag, words)
def __str__(self) -> str:
text = []
for w in self.words:
if isinstance(w, str):
text.append(w)
else:
text.append(str(w))
return f"({self.pos} {' '.join(text)})"
def __repr__(self) -> str:
return f"Tag({self.pos!r}, [{', '.join(repr(c) for c in self.words)}])"
[docs]
def clean(self) -> str:
text = []
for w in self.words:
if isinstance(w, str):
text.append(w)
else:
text.append(w.clean())
return " ".join(text)
def test_tag() -> None:
src = '(S (VP (TV kill) (NP (DET the) (NP mage))))'
t = Tag.from_text(src)
assert t == Tag(pos='S', words=[
Tag(pos='VP', words=[
Tag(pos='TV', words=['kill']),
Tag(pos='NP', words=[
Tag(pos='DET', words=['the']),
Tag(pos='NP', words=['mage'])
])
])
])
assert str(t) == "(S (VP (TV kill) (NP (DET the) (NP mage))))"
assert t.clean() == "kill the mage"
def test_transform_rule():
rule1 = TransformRule("(S (NP $n) (VP $v $n2))", "(S (VP $v) (NP $n) (PP a $n2))")
sa = Tag.from_text("(S (NP I) (VP am (NP groot)))")
xform_sa1 = rule1.apply(sa)
assert xform_sa1 == Tag(pos='S', words=[
Tag(pos='VP', words=["am"]),
Tag(pos='NP', words=["I"]),
Tag(pos='PP', words=[
"a",
Tag(pos='NP', words=["groot"])
])
])
assert str(xform_sa1) == "(S (VP am) (NP I) (PP a (NP groot)))"
assert xform_sa1.clean() == "am I a groot"
def test_rule_pair():
rule1 = TransformRule("(S (NP (DET $d) (N $n)) (VP $v $n2))", "(S (VP $v) (NP $n $d) $n2)")
rule2 = TransformRule("(VP $x (ADVP (NP $y)))", "(VP $x (PP with (NP $y)))")
sb = Tag.from_text("(S (NP (DET the) (N mage)) (VP dies (ADVP (NP slowly))))")
xform_sb2 = rule2.apply(sb)
xform_sb1 = rule1.apply(xform_sb2)
assert str(xform_sb1) == "(S (VP dies) (NP mage the) (PP with (NP slowly)))"
assert xform_sb1.clean() == "dies mage the with slowly"
# Main App(s)
def translate(source: str, maker: WordMaker) -> tuple[Tag, list[str]]:
"""
Use the given WordMaker and transformation rules to translate a tagged sentence.
.. todo::
- R1-a. See test_transform_rule
- R1-b. See test_rule_pair, but reformat noun-determiner ``(NP ${n|suffix(d)})``
- R1-c. Special case with no determiner, inject generic "a".
- R2. Stem the adverb to drop the adverbial suffix "-ly", ``(VP ${v|stem|tense(m)})``
- R3. Rewrite individual verbs with mode/tense as suffix. ``(VP (MD $m) (VP $v)) -> (VP ${v|stem|tense(m)})`` add tense prefix to verb
- R4. Rewrite individual nouns with determiners ``(NP (DET $d) (N $n)) -> (NP ${n|suffix(d)})``
"""
# Special case of no-determiner.
rule1c = TransformRule("(S (NP $n) (VP $v $n2))",
"(S (VP $v) (NP $n) (PP a $n2))")
phrase = Tag.from_text(source)
xform_s1 = rule1c.apply(phrase)
words = (maker.word(w) for w in xform_s1.clean().split())
return xform_s1, list(words)
def get_options(args: List[str] = sys.argv[1:]) -> argparse.Namespace:
seed_algo = {'h': hash_seed, 'hash': hash_seed,
'n': naive_seed, 'naive': naive_seed}
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--algorithm', type=str, action='store', default='hash')
parser.add_argument('-g', '--generate', type=int, action='store', default=None, help="spew words")
parser.add_argument('-w', '--words', type=int, action='store', default=None, help="translate words")
parser.add_argument('-t', '--translate', type=str, action='store', default=None, help="translate a tagged phrase")
options = parser.parse_args(args)
try:
options.algorithm = seed_algo[options.algorithm]
except KeyError as ex:
msg = "Alogorithm {} isn't known".format(ex)
raise argparse.ArgumentTypeError(msg)
return options
def main() -> None:
options = get_options()
maker = WordMaker(options.algorithm)
if options.generate is not None:
for i in range(options.generate):
print(maker.word())
if options.words is not None:
for w in options.words.split():
print(w, maker.word(w))
if options.translate is not None:
tagged, words = translate(options.translate, maker)
print(options.translate, tagged.clean(), words )
if __name__ == "__main__":
main()