Source code for language

"""
ConLang Grammar and Lexicon.

See https://www.nltk.org/book/ch05.html for the Parts-of-Speech Tags

Also, see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
for a more complete set.

See https://www.cis.upenn.edu/~bies/manuals/tagguide.pdf

Also https://universaldependencies.org/u/pos/all.html

The tricky part is -- of course -- the grammar.

Our goal is to move from a tagged English parse tree to the ConLang tree through a number of transformations.

Input
::

    (S (VP (TV kill) (NP (DET the) (N mage))))

In English, this Transitive Verb example doesn't have a subject only an object.
English permits ``S -> VP NP`` as well as more the more conventional ``S -> NP VP`` structures.

See https://ucrel.lancs.ac.uk/bnc2/bnc2guide.htm where VVI/VVZ is used for imperative.

The target ConLang in this implementation is ``VP NP NP``, ``VP NP PP``, ``VP NP PP NP PP`` kinds of things. Note the recursive
NP -> Det Nominal and Nominal -> Nominal Noun to permit constructing complicated relationships.

"go-Kill you the mage" sorts of constructs.
Present is the base form.
Imperative is a "go-" prefix.
Present participle ("-ing" in English) is a "now-" prefix.
Past ("-ed" in English) is a "did-" prefix.
No person conjugation. It's a separate word after the verb.

..  csv-table:: Verb Subcategories

    Symbol,	Meaning,	Example
    IV,	intransitive verb,	barked ``(VP -> IV Adj)``
    TV,	transitive verb,	saw a man ``(VP -> TV NP)``
    DatV,	dative verb,	gave a dog to a man ``(VP -> NP PP)``
    SV,	sentential verb,	said that a dog barked ``(V -> SV S)``

Generally, this example ConLang lacks Intransitive verbs. "did-Bark the dog at something."
"did-See myself the man." "did-Give the dog to a man." "did-Say the man did-Bark the dog at something."

Rules needed
============

- Pivot Declarative to VP first. ``(S (NP $n) (VP $v) $x) -> (S (VP $v) (NP $n) $x)``.
  Imperative ``(S (VP))`` left alone.
  Interrogative ``(S (Aux NP VP))``, ``(S (Wh-NP VP))``, ``(S (Wh-NP Aux NP VP))`` should also be swapped around to verb first.

- Adverbs are split out with a helper. ``(VP $x (ADVP (NP $y))) -> (VP $x (PP with (NP $y)))``.

- The Verb Subcategories rules, above.

- Apply Tense Prefix to base Verb's stem word. ``(VP (MD $m) (VP $v)) -> (VP ${f|prefix(m)}))`` rewrite verb root with mode prefix.

- Apply noun determiner suffix to nounds ``(NP (DET $d) (N $n)) -> (NP ${n|suffix(d)})``.

"""

import argparse
import bisect
from collections import defaultdict, deque
import csv
from enum import Enum
import hashlib
import io
import itertools
from pprint import pprint, pformat
import random
import re
import sys
from textwrap import dedent
from typing import NamedTuple, TypeVar, List, Tuple, Dict, Callable, Union

### Weighted Choices.

CT = TypeVar("CT")

[docs] def weighted_choice(source: List[Tuple[CT, int]]) -> CT: """Given [(string, int), ...] weighted strings, pick a string.""" choices, weights = zip(*source) cumulative_dist = list(itertools.accumulate(weights)) selection = random.random()*cumulative_dist[-1] return choices[bisect.bisect(cumulative_dist, selection)]
def test_weighted_choice_1() -> None: source = [('a', 1), ('b', 1), ('c', 1), ('d', 1)] random.seed(42) examples = [weighted_choice(source) for _ in range(1000)] assert examples[:10] == ['c', 'a', 'b', 'a', 'c', 'c', 'd', 'a', 'b', 'a'] from collections import Counter distribution = Counter(examples) assert distribution.most_common() == [('c', 263), ('d', 257), ('b', 242), ('a', 238)] def test_weighted_choice_2() -> None: source = [('a', 1), ('b', 2), ('c', 4), ('d', 8)] random.seed(42) examples = [weighted_choice(source) for _ in range(1000)] assert examples[:10] == ['d', 'a', 'c', 'c', 'd', 'd', 'd', 'b', 'c', 'a'] from collections import Counter distribution = Counter(examples) assert distribution.most_common() == [('d', 551), ('c', 271), ('b', 126), ('a', 52)] ### Lexicon. # For this application, weighted choices are strings. WeightedChoices = List[Tuple[str, int]]
[docs] class WordMaker: """Create words from a few rules. 1. Markov chains based on digraphs. 2. Initial Letter thrown on kind of randomly. 3. Seeded RNG from source word. """ # https://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/digraphs.html digraph_text = dedent('''\ Digraph Count Digraph Frequency th 5532 th 1.52 he 4657 he 1.28 in 3429 in 0.94 er 3420 er 0.94 an 3005 an 0.82 re 2465 re 0.68 nd 2281 nd 0.63 at 2155 at 0.59 on 2086 on 0.57 nt 2058 nt 0.56 ha 2040 ha 0.56 es 2033 es 0.56 st 2009 st 0.55 en 2005 en 0.55 ed 1942 ed 0.53 to 1904 to 0.52 it 1822 it 0.50 ou 1820 ou 0.50 ea 1720 ea 0.47 hi 1690 hi 0.46 is 1660 is 0.46 or 1556 or 0.43 ti 1231 ti 0.34 as 1211 as 0.33 te 985 te 0.27 et 704 et 0.19 ng 668 ng 0.18 of 569 of 0.16 al 341 al 0.09 de 332 de 0.09 se 300 se 0.08 le 298 le 0.08 sa 215 sa 0.06 si 186 si 0.05 ar 157 ar 0.04 ve 148 ve 0.04 ra 137 ra 0.04 ld 64 ld 0.02 ur 60 ur 0.02 ''') # https://en.wikipedia.org/wiki/Letter_frequency first_letter_text = dedent('''\ Letter Frequency z 0.034% y 1.620% x 0.017% w 6.753% v 0.649% u 1.487% t 16.671% s 7.755% r 1.653% q 0.173% p 2.545% o 6.264% n 2.365% m 4.383% l 2.705% k 0.590% j 0.597% i 6.286% h 7.232% g 1.950% f 3.779% e 2.007% d 2.670% c 3.511% b 4.702% a 11.602% ''') # Real data: http://norvig.com/mayzner.html # Handy approximation # word lengths are 2 to 12, linear 200 down to 20 occurrences. # (x, y) = (2, 200) # (x, y) = (12, 20) # m = Fraction(200-20)/Fraction(2-12) = -18 # b = 200-m*2 = 236 lengths = [(x, 236-18*x) for x in range(2,13)]
[docs] def __init__(self, make_seed: Callable[[str], int]) -> None: """ Prepare the generator using a seed-creating function and loading the frequency tables. :param make_seed: a function to transform English word to a seed for generating a ConLang word. """ self.load_digraph_markov() self.load_first_letters() self.make_seed = make_seed
[docs] def load_digraph_markov(self) -> None: source_file = io.StringIO(self.digraph_text) reader = csv.DictReader(source_file, delimiter='\t', skipinitialspace=True) self.digraphs: WeightedChoices = list( (row['Digraph'], int(row['Count'])) for row in reader ) # The digraphs -- alone -- aren't very English like. # Turning them into Markov Chains is more useful. self.markov: Dict[str, WeightedChoices] = defaultdict(list) for digraph, count in self.digraphs: start, nextc = digraph self.markov[start].append((nextc, count))
[docs] def load_first_letters(self) -> None: source_file = io.StringIO(self.first_letter_text) reader = csv.DictReader(source_file, delimiter='\t', skipinitialspace=True) self.first_letters = list((row['Letter'], 1000*float(row['Frequency'][:-1])) for row in reader)
[docs] def word(self, seed: str = "") -> str: """Expand weighted Markov chains through the digraphs.""" if seed: random.seed(self.make_seed(seed)) target_length = weighted_choice(self.lengths) c_1, c_2 = weighted_choice(self.digraphs) if target_length == 2: w = [c_1, c_2] else: c_0 = weighted_choice(self.first_letters) w = [c_0, c_1, c_2] while len(w) < target_length: if w[-1] in self.markov: w.append(weighted_choice(self.markov[w[-1]])) else: target_length += 2 c_1, c_2 = weighted_choice(self.digraphs) w.extend(["'", c_1, c_2]) return ''.join(w)
[docs] def naive_seed(original: str) -> int: """Transform source word into RNG seed. Seems to have too many collisions.""" return sum(original.encode('ascii'))
[docs] def hash_seed(original: str) -> int: """Transform source word into RNG seed.""" return sum(hashlib.sha1(original.encode('ascii')).digest())
def test_wordmaker() -> None: m = WordMaker(lambda n: n) words = [m.word(n) for n in range(1, 100, 10)] assert words == ['is', 'aesth', 'to', 'he', 'tere', 'tnt', 'cesth', 'yeng', 'jonde', 'in'] random.seed(42) lexicon = [m.word() for _ in range(5000)] from collections import Counter first = Counter(w[0] for w in lexicon) pct = sorted(((letter, 100*count/5000) for letter, count in first.most_common()), reverse=True) assert pct[:5] == [('z', 0.02), ('y', 1.2), ('x', 0.02), ('w', 4.62), ('v', 0.74)] assert pct[-5:] == [('e', 4.7), ('d', 2.4), ('c', 3.38), ('b', 4.22), ('a', 11.22)] ### Grammar. # We don't parse English into the tagged structure. # The material to be translated needs to be marked up manually. # (Use NLTK to create tagged structures https://www.nltk.org/api/nltk.parse.stanford.html)
[docs] class Tag(NamedTuple): """Tagged text that forms a tree. (Similar to NLTK.Tree.)""" pos: str # Part of Speech. "S", "NP", "VP", "ADVP", "MD", "TV", "IV", "DatV", "SV", "PP", "IN" words: list['str | Tag'] # Terminal or Tag
[docs] @staticmethod def from_text(source: str) -> "Tag": """Parse (tag (tag word) (tag word)) kinds of structures.""" lex_pat = re.compile(r"\(|\)|\s+|[^\s()]+") tokens = (t for t in lex_pat.findall(source) if not t.isspace()) symbols = deque(tokens) return Tag.from_symbols(symbols)
[docs] @staticmethod def from_symbols(symbols: List[str]) -> "Tag": assert symbols.popleft() == "(", f'Invalid structure {symbols}' tag = symbols.popleft() words = [] while symbols[0] != ")": if symbols[0] == "(": words.append(Tag.from_symbols(symbols)) else: words.append(symbols.popleft()) assert symbols.popleft() == ")", f'Invalid structure {symbols}' return Tag(tag, words)
def __str__(self) -> str: text = [] for w in self.words: if isinstance(w, str): text.append(w) else: text.append(str(w)) return f"({self.pos} {' '.join(text)})" def __repr__(self) -> str: return f"Tag({self.pos!r}, [{', '.join(repr(c) for c in self.words)}])"
[docs] def clean(self) -> str: text = [] for w in self.words: if isinstance(w, str): text.append(w) else: text.append(w.clean()) return " ".join(text)
def test_tag() -> None: src = '(S (VP (TV kill) (NP (DET the) (NP mage))))' t = Tag.from_text(src) assert t == Tag(pos='S', words=[ Tag(pos='VP', words=[ Tag(pos='TV', words=['kill']), Tag(pos='NP', words=[ Tag(pos='DET', words=['the']), Tag(pos='NP', words=['mage']) ]) ]) ]) assert str(t) == "(S (VP (TV kill) (NP (DET the) (NP mage))))" assert t.clean() == "kill the mage"
[docs] class TransformRule: """Find a structure like (TAG $x $y) and transform to (TAG $y $x). This has a number of limitations, of course. Principally, it doesn't use Chomsky Normal Form. In CNF, every production having either two non-terminals or one terminal on the right-hand side. This seems similar to the way lambda calculus rewrites higher-arity operators as single-operand lambdas. .. todo:: Expand placeholders to include transformations like stemming a word. """ def __init__(self, source: str, target: str): self.tag_source = Tag.from_text(source) self.tag_target = Tag.from_text(target)
[docs] def match(self, tag_source: Tag, some_content: Tag) -> bool: """Match the entire tag pattern and the content being examined.""" if some_content.pos == tag_source.pos: # Descend to see if internal structure inside *also* matches child_matches = [] for src_child, content_child in itertools.zip_longest(tag_source.words, some_content.words): if not src_child or not content_child: # lengths do not match child_matches.append(False) elif isinstance(src_child, str): # Two kinds of strings, placeholders and terminals. if src_child.startswith("$"): # Placeholder, e.g. "$x", matches anything. child_matches.append(True) else: # Terminals must match child_matches.append(src_child.lower() == content_child.lower()) elif isinstance(src_child, Tag): # Tag, e.g. ("PP", [words]) child_matches.append(self.match(src_child, content_child)) else: raise ValueError(f"{src_child} in {tag_source.words}") return all(child_matches) return False
[docs] def placeholders(self, tag_source: Tag, some_content: Tag, variables: Dict[str, Union[Tag, str]]) -> None: """ Given a pattern and tagged content, locate and assign values to $ placeholders. This does a recursive depth-first search. .. todo:: Tolerate ``${w|stem}`` which will apply a ``stem()`` function to ``w``. """ for src_child, content_child in zip(tag_source.words, some_content.words): if isinstance(src_child, str): if src_child.startswith("$"): variables[src_child] = content_child else: # Tag, e.g. ("PP", [words]) self.placeholders(src_child, content_child, variables)
[docs] def emit(self, tag_target: Tag, variables: Dict[str, Union[Tag, str]]) -> Tag: """Replace ``$`` placeholders with values. Apply functions like ``${x|stem}``""" rewrite = Tag(tag_target.pos, []) for w in tag_target.words: if isinstance(w, str): if w.startswith("$"): rewrite.words.append(variables.get(w, Tag("X", ["?"]))) else: # Literal rewrite.words.append(w) else: # Tag rewrite.words.append(self.emit(w, variables)) return rewrite
[docs] def apply(self, some_content: Tag) -> Tag: """ Apply this rule to some tagged content. """ if self.match(self.tag_source, some_content): # Return revised content. variables: Dict[str, Union[Tag, str]] = {} self.placeholders(self.tag_source, some_content, variables) return self.emit(self.tag_target, variables) # Descend into structure looking for a match. # Return a copy of the original with ONLY matching children changed. rewrite = Tag(some_content.pos, []) for w in some_content.words: # RECURSIVE DESCENT -- depth-first. if isinstance(w, str): # Terminal is untouched rewrite.words.append(w) elif isinstance(w, Tag): # Tag might be rewritten rewrite.words.append(self.apply(w)) else: raise ValueError(f"{w} in {some_content}") return rewrite
def test_transform_rule(): rule1 = TransformRule("(S (NP $n) (VP $v $n2))", "(S (VP $v) (NP $n) (PP a $n2))") sa = Tag.from_text("(S (NP I) (VP am (NP groot)))") xform_sa1 = rule1.apply(sa) assert xform_sa1 == Tag(pos='S', words=[ Tag(pos='VP', words=["am"]), Tag(pos='NP', words=["I"]), Tag(pos='PP', words=[ "a", Tag(pos='NP', words=["groot"]) ]) ]) assert str(xform_sa1) == "(S (VP am) (NP I) (PP a (NP groot)))" assert xform_sa1.clean() == "am I a groot" def test_rule_pair(): rule1 = TransformRule("(S (NP (DET $d) (N $n)) (VP $v $n2))", "(S (VP $v) (NP $n $d) $n2)") rule2 = TransformRule("(VP $x (ADVP (NP $y)))", "(VP $x (PP with (NP $y)))") sb = Tag.from_text("(S (NP (DET the) (N mage)) (VP dies (ADVP (NP slowly))))") xform_sb2 = rule2.apply(sb) xform_sb1 = rule1.apply(xform_sb2) assert str(xform_sb1) == "(S (VP dies) (NP mage the) (PP with (NP slowly)))" assert xform_sb1.clean() == "dies mage the with slowly" # Main App(s) def translate(source: str, maker: WordMaker) -> tuple[Tag, list[str]]: """ Use the given WordMaker and transformation rules to translate a tagged sentence. .. todo:: - R1-a. See test_transform_rule - R1-b. See test_rule_pair, but reformat noun-determiner ``(NP ${n|suffix(d)})`` - R1-c. Special case with no determiner, inject generic "a". - R2. Stem the adverb to drop the adverbial suffix "-ly", ``(VP ${v|stem|tense(m)})`` - R3. Rewrite individual verbs with mode/tense as suffix. ``(VP (MD $m) (VP $v)) -> (VP ${v|stem|tense(m)})`` add tense prefix to verb - R4. Rewrite individual nouns with determiners ``(NP (DET $d) (N $n)) -> (NP ${n|suffix(d)})`` """ # Special case of no-determiner. rule1c = TransformRule("(S (NP $n) (VP $v $n2))", "(S (VP $v) (NP $n) (PP a $n2))") phrase = Tag.from_text(source) xform_s1 = rule1c.apply(phrase) words = (maker.word(w) for w in xform_s1.clean().split()) return xform_s1, list(words) def get_options(args: List[str] = sys.argv[1:]) -> argparse.Namespace: seed_algo = {'h': hash_seed, 'hash': hash_seed, 'n': naive_seed, 'naive': naive_seed} parser = argparse.ArgumentParser() parser.add_argument('-a', '--algorithm', type=str, action='store', default='hash') parser.add_argument('-g', '--generate', type=int, action='store', default=None, help="spew words") parser.add_argument('-w', '--words', type=int, action='store', default=None, help="translate words") parser.add_argument('-t', '--translate', type=str, action='store', default=None, help="translate a tagged phrase") options = parser.parse_args(args) try: options.algorithm = seed_algo[options.algorithm] except KeyError as ex: msg = "Alogorithm {} isn't known".format(ex) raise argparse.ArgumentTypeError(msg) return options def main() -> None: options = get_options() maker = WordMaker(options.algorithm) if options.generate is not None: for i in range(options.generate): print(maker.word()) if options.words is not None: for w in options.words.split(): print(w, maker.word(w)) if options.translate is not None: tagged, words = translate(options.translate, maker) print(options.translate, tagged.clean(), words ) if __name__ == "__main__": main()