Synthetic Data¶

Not focused on Generative Adversarial Networks (although that's possible)

Focused on Database, ETL, and data analytics

Steven Lott

About Me¶

I hold a patent on some synthetic data processing.

Not for generative machine learning models. (I need to say that early and often.)

Agenda¶

  1. What do you mean "Synthetic Data"?

    Isn't "synthetic" a silly distinction?

  2. Why bother?

  3. Some tooling for creating useful synthetic data

  4. The Cool Stuff

Part 1. What do you mean Synthetic?¶

  • Created.

  • Not collected.

  • Free from difficult-to-explain features.

  • Not necessarily useful for Generative AI Models

Test Cases¶

Test cases use made-up (synthetic) data.

In [1]:
from string import punctuation, whitespace

def some_func(x):
    """
    >>> some_func("hello world") = {'d', 'e', 'h', 'l', 'o', 'r', 'w'}
    """
    return set(x) - (set(punctuation) | set(whitespace))

We can test by monkeypatching set(), replacing it with a MagicMock.

And assert that some sentinel objects were operated on.

Check the call history to look for __sub__() and __or__() operations.

But...¶

Using Sentinel and Mock isn't very compelling.

It tells us the implementation does some specific steps.

But does it tell us the implementation will get the right answer?

(Hint: Not really.)

Part 2. Why Bother?¶

Or

How is this even a thing?

Question of Scale¶

  • A single example is "artisanal data"

  • A bunch of examples is "data"

How big a "bunch" do we need?

Boundary Value Analysis¶

7 common cases:

Min - 1, Min, Min + 1, "Nominal Value", Max - 1, Max, Max + 1
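These cases are mechanical enough to generate. A minimal sketch, assuming an integer-valued range:

```python
def boundary_cases(min_value: int, max_value: int, nominal: int) -> list[int]:
    # The seven classic boundary-value inputs for [min_value, max_value].
    return [
        min_value - 1, min_value, min_value + 1,
        nominal,
        max_value - 1, max_value, max_value + 1,
    ]

boundary_cases(1, 100, 42)
```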

Is that enough? One value in range?

In [3]:
factorial(5)
Out[3]:
120.0
In [4]:
factorial(4.5)
Out[4]:
52.34277778455352
In [5]:
factorial(-0.5)**2
Out[5]:
3.1415926535897927
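The session above behaves like a gamma-based factorial rather than math.factorial() (which rejects non-integers); a sketch that reproduces those results:

```python
import math

def factorial(x: float) -> float:
    # Gamma generalizes factorial: gamma(n + 1) == n! for non-negative integers,
    # but it happily accepts 4.5 and -0.5 as well.
    return math.gamma(x + 1)

factorial(5)           # 120.0
factorial(-0.5) ** 2   # approximately pi
```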

Unfair!¶

Boundary analysis was clear: -0.5 was out-of-bounds.

Testing float values for what appears to be an int function isn't right.

  • Like lifting your lawn mower to trim your hedge.

  • Not the point of synthetic data.

Better Examples¶

Database design.

  • Does the SQL perform?

  • Or is a different non-relational model better?

Algorithm design.

Network design.

Does it scale?

Consider¶

data_model.png

What locations does a department use?

When there are 40,000 employees? 5,000 managers?

Common Approach¶

  • Mock up a few Departments with a few Locations.

  • Use Boundary Value Analysis to handle optional, missing, and special-case relationships.

  • One -- maybe two -- of each.

  • Lovingly hand-craft the INSERT statements.

Better Approach¶

  1. Define the schema in considerable, realistic detail.

  2. Build representative data in representative volumes.

  3. Load a database. Populate optimizer statistics.

  4. Run the queries.

  5. Refine the schema when it performs poorly.

  6. Define transactions and updates to examine statistics processing.

Impediments¶

  • SQL Schema is a flaming dumpster fire.

  • Even JSON schema lacks details.

  • Distributions and Histograms are required.

    • For new development, guess wisely.
    • You'll guess wrong: plan for change.

Part 3. Some tooling for creating useful synthetic data¶

  • Schema Definition

  • Data Generation

See https://github.com/slott56/DataSynthTool

Schema Definition¶

  • Define schema with JSONSchema + Extensions.

  • Use JSONSchema to synthesize data.

  • If you're using a SQL database, map JSONSchema to SQL.

It's fun to use Pydantic to generate JSONSchema.

Starting Point -- Design¶

Use PlantUML to sketch the data model.

It draws diagrams.

plantuml
@startuml
class Employee {
    id: int
    {field} name: string(40)
    hire_date: timestamp
    velocity: float
    manager_id: int
}
@enduml

Refine to Pydantic Model¶

In [6]:
import datetime
from typing import Annotated
from pydantic import BaseModel, Field

class Employee(BaseModel):
    id: int
    name: Annotated[
        str, Field(max_length=40)]
    hire_date: Annotated[
        datetime.datetime, Field(ge=datetime.datetime(2021, 1, 18))]
    velocity: Annotated[
        float, Field(ge=2, le=21, 
                     json_schema_extra={"distribution": "Normal"})]
    manager: Annotated[
        int, Field(json_schema_extra={"sql": {'foreignKey': 'Manager.id'}})]

Create JSONSchema from class definition¶

In [7]:
from pprint import pprint
pprint(Employee.model_json_schema())
{'properties': {'hire_date': {'format': 'date-time',
                              'title': 'Hire Date',
                              'type': 'string'},
                'id': {'title': 'Id', 'type': 'integer'},
                'manager': {'sql': {'foreignKey': 'Manager.id'},
                            'title': 'Manager',
                            'type': 'integer'},
                'name': {'maxLength': 40, 'title': 'Name', 'type': 'string'},
                'velocity': {'distribution': 'Normal',
                             'maximum': 21.0,
                             'minimum': 2.0,
                             'title': 'Velocity',
                             'type': 'number'}},
 'required': ['id', 'name', 'hire_date', 'velocity', 'manager'],
 'title': 'Employee',
 'type': 'object'}

Data Generation¶

An initial proof-of-concept design that works.

But has limitations.

Build Field Values¶

In [9]:
from typing import Any
from pydantic.fields import FieldInfo
from annotated_types import MaxLen, Ge, Le
from operator import attrgetter

def field_generate(field: FieldInfo) -> Any:
    if issubclass(field.annotation, str):
        size = get_meta(MaxLen, attrgetter("max_length"), field)
        print("String", size, field.metadata, field.json_schema_extra)
        return "String"
    elif issubclass(field.annotation, int):
        min_value = get_meta(Ge, attrgetter("ge"), field)
        max_value = get_meta(Le, attrgetter("le"), field)
        print("Integer", min_value, max_value, field.metadata, field.json_schema_extra)
        return min_value if min_value is not None else 0 
    elif issubclass(field.annotation, datetime.datetime):
        default = datetime.datetime(1970, 1, 1)
        min_value = get_meta(Ge, attrgetter("ge"), field)
        print("Date", min_value, field.metadata, field.json_schema_extra)
        return min_value or default
    elif issubclass(field.annotation, float):
        min_value = get_meta(Ge, attrgetter("ge"), field)
        max_value = get_meta(Le, attrgetter("le"), field)
        print("Float", min_value, max_value, field.metadata, field.json_schema_extra)
        return min_value if min_value is not None else 0.0
    else:
        raise ValueError(f"unsupported {field.annotation=} {field.metadata=} {field.json_schema_extra=}")
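field_generate() depends on a get_meta() helper that isn't shown here (presumably defined in an omitted cell). A plausible sketch, assuming it scans the field's Annotated metadata for the first constraint of a given type:

```python
from dataclasses import dataclass
from operator import attrgetter
from typing import Any, Callable

def get_meta(meta_type: type, getter: Callable[[Any], Any], field: Any) -> Any:
    # Return the getter applied to the first metadata entry of the
    # requested type; None when the constraint is absent.
    for meta in field.metadata:
        if isinstance(meta, meta_type):
            return getter(meta)
    return None

# Stand-ins for illustration -- Pydantic's FieldInfo and
# annotated_types.MaxLen play these roles in the real cells.
@dataclass
class MaxLen:
    max_length: int

@dataclass
class FakeField:
    metadata: list

size = get_meta(MaxLen, attrgetter("max_length"), FakeField([MaxLen(40)]))
```

Pydantic exposes Annotated constraints like MaxLen(40) in FieldInfo.metadata, so this one lookup covers the MaxLen, Ge, and Le cases used above.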

This design isn't particularly good.

But.

It gets you started immediately.

Build Objects¶

In [10]:
def row_generate(cls_: type[BaseModel]) -> BaseModel:
    data = {
        name: field_generate(field)
        for name, field in cls_.model_fields.items()
    }
    # Derived values (2NF and 3NF violations.)
    return cls_(**data)
In [11]:
row_generate(Employee)
Integer None None [] None
String 40 [MaxLen(max_length=40)] None
Date 2021-01-18 00:00:00 [Ge(ge=datetime.datetime(2021, 1, 18, 0, 0))] None
Float 2 21 [Ge(ge=2), Le(le=21)] {'distribution': 'Normal'}
Integer None None [] {'sql': {'foreignKey': 'Manager.id'}}
Out[11]:
Employee(id=0, name='String', hire_date=datetime.datetime(2021, 1, 18, 0, 0), velocity=2.0, manager=0)

Serialize as needed¶

While SQL-like databases are common...

No-SQL databases and non-database processing are even more common.

In some cases, you need to make persistent data.

  • CSV
  • Newline Delimited JSON
  • SQL INSERT script
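The newline-delimited JSON case is a few lines; a sketch assuming rows arrive as plain dicts (e.g. from model_dump()):

```python
import json
from pathlib import Path
from typing import Iterable

def write_ndjson(rows: Iterable[dict], target: Path) -> None:
    # One JSON document per line: easy to stream, split, and bulk-load.
    with target.open("w", encoding="utf-8") as output:
        for row in rows:
            output.write(json.dumps(row, default=str) + "\n")
```

The default=str fallback keeps datetime values serializable without a custom encoder.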

Make SQL Table Definitions (if you must)¶

In [12]:
def SQLSchema(some_class):
    js = some_class.model_json_schema()
    print(f"CREATE TABLE IF NOT EXISTS {js['title']} (")
    columns = [
        f"  {name} {p['type'].upper()}"
        for name, p in js['properties'].items()
    ]
    print(",\n".join(columns))
    print(")")
In [13]:
SQLSchema(Employee)
CREATE TABLE IF NOT EXISTS Employee (
  id INTEGER,
  name STRING,
  hire_date STRING,
  velocity NUMBER,
  manager INTEGER
)

Part 4. The Cool Stuff¶

Fine-Tuning Data Domains

  • String sizes and patterns
  • Numbers with distributions
  • Dates and DateTimes
  • Enumerated values with histograms
  • Benford's Law
  • Foreign Keys
  • Optional Values

Plan on revising and extending these constantly.

Strings¶

  • minLength and maxLength
  • pattern a regular expression
  • format a set of data domains like "date", "time", "date-time", "duration", "email", etc.

Outside the pattern and format, supplemental details are needed:

  • length distribution
  • "subdomain" of string: text, name, address, part number, zip code, etc.

You'll often have to invent -- and re-invent -- these
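As one illustration, a name-ish string generator with a length distribution. The subdomain rules here are invented placeholders; real projects substitute name lists, address formats, and so on:

```python
import random
import string

def synth_name(mean: float = 12, sd: float = 3, max_length: int = 40) -> str:
    # Length drawn from a normal distribution, clamped to [1, max_length];
    # content is a throwaway "name" subdomain.
    size = max(1, min(max_length, round(random.gauss(mean, sd))))
    return "".join(random.choices(string.ascii_lowercase, k=size)).title()
```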

Numbers (int, float, currency, durations)¶

  • Needs min, max, distribution.

Some distributions are directly supported by the random module: uniform, triangular, exponential, normal, etc.

Benford is not directly available.
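A resampling sketch for a bounded normal field like the velocity attribute above; the mean and standard deviation choices are assumptions:

```python
import random

def synth_velocity(min_value: float = 2, max_value: float = 21) -> float:
    # Normal distribution centered in the range; resample until in bounds.
    mean = (min_value + max_value) / 2
    sd = (max_value - min_value) / 6
    while True:
        value = random.gauss(mean, sd)
        if min_value <= value <= max_value:
            return value
```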

Dates¶

  • Needs min, max, distribution.

  • Convert min and max to ordinal dates or timestamps.

Use ordinary numeric generator functions you already wrote.

Use fromordinal() or fromtimestamp() to create a valid datetime object.
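Following that recipe, a date generator reduces to an integer generator plus a conversion; the bounds here are hypothetical:

```python
import datetime
import random

def random_date(min_date: datetime.date, max_date: datetime.date) -> datetime.date:
    # Convert the bounds to ordinals, reuse an ordinary integer
    # generator, and convert the result back to a date.
    ordinal = random.randint(min_date.toordinal(), max_date.toordinal())
    return datetime.date.fromordinal(ordinal)

hired = random_date(datetime.date(2021, 1, 18), datetime.date(2024, 12, 31))
```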

Enumerated Values¶

  • Codes and status values and what-not
  • Very narrow domains
  • Very lumpy distributions

Use random.choices() with relative weights.
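For example, with a hypothetical status-code domain and deliberately lumpy weights:

```python
import random

# Hypothetical code domain with a lumpy histogram: ACTIVE dominates.
STATUS_CODES = ["ACTIVE", "INACTIVE", "PENDING", "ERROR"]
WEIGHTS = [70, 20, 9, 1]

def status_values(count: int) -> list[str]:
    # random.choices() applies the relative weights directly.
    return random.choices(STATUS_CODES, weights=WEIGHTS, k=count)

sample = status_values(1000)
```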

Benford's Law¶

  • Leading digits of measured values are not always uniformly distributed.

    Bank balance amounts, for example.

Tricky

Use ordinary numeric generator functions you already wrote.

Replace the 1st digit following Benford's Law via random.choices().

Iterate if it's no longer in the valid range.
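A sketch of that recipe for positive integers, with the digit weights taken from Benford's Law, log10(1 + 1/d):

```python
import math
import random

DIGITS = "123456789"
WEIGHTS = [math.log10(1 + 1 / d) for d in range(1, 10)]

def benford(generate, min_value: int, max_value: int) -> int:
    # Draw from an ordinary generator, overwrite the leading digit
    # using the Benford weights, and iterate when the result is out of range.
    while True:
        digits = str(generate())
        leading = random.choices(DIGITS, weights=WEIGHTS)[0]
        value = int(leading + digits[1:])
        if min_value <= value <= max_value:
            return value

balance = benford(lambda: random.randint(100, 99_999), 100, 99_999)
```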

Foreign Keys¶

Needs "cardinality" factor.

  • Define table row counts: Employee has 10,000, Managers has 2,000
  • Gives 1:m where mean(m) = 5.

Generate primary key domains for all tables in the schema.

Shuffle the domain into a random order to build PK's.

Select from the domain to build FK's.
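For the Employee/Manager volumes mentioned above, a sketch:

```python
import random

def make_keys(n_managers: int, n_employees: int) -> tuple[list[int], list[int]]:
    # PK domain for Manager, shuffled so keys don't look sequential.
    manager_pks = list(range(1, n_managers + 1))
    random.shuffle(manager_pks)
    # Each Employee row selects its FK from the Manager PK domain:
    # 2,000 managers for 10,000 employees gives 1:m with mean(m) = 5.
    employee_fks = random.choices(manager_pks, k=n_employees)
    return manager_pks, employee_fks

manager_pks, employee_fks = make_keys(2_000, 10_000)
```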

Optional Values¶

Needs "optionality" factor.

This is probability of None.

Sometimes too simplistic: consider "Domain-specific nulls"

  • A special value that is semantically a null,

  • not literally None or NULL or whatever.

  • Social Security Number 999-99-9999.
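A sketch combining an optionality factor with that domain-specific null; the SSN pattern is purely illustrative:

```python
import random

NO_SSN = "999-99-9999"  # domain-specific null: semantically "not provided"

def synth_ssn(optionality: float = 0.10) -> str:
    # With probability `optionality`, emit the domain-specific null
    # instead of a synthesized value.
    if random.random() < optionality:
        return NO_SSN
    return (
        f"{random.randint(1, 898):03d}"
        f"-{random.randint(1, 99):02d}"
        f"-{random.randint(1, 9999):04d}"
    )

sample = [synth_ssn() for _ in range(1000)]
```

Downstream code still sees a well-formed string in every row, which is exactly why these nulls are easy to overlook.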

There's Still More¶

Code often imposes Yet More Constraints.

  • Unions (also known as Subentities)

    • A discriminator
    • Optional fields depending on discriminator value
  • Relationships among fields (A = B + C)

  • 2NF optimizations

    • One field has a Foreign Key
    • Another field non-key values from the foreign table

Conclusion¶

  • One size of data synth will not solve very many problems.

  • Each design problem will add yet more interesting complications and relationships.

  • Plan to validate synth data against actual data.

    • And revise designs as reality intervenes.
    • And revise again with each lesson learned.
  • Plan for ongoing synth to handle usage changes and new features.

Implementation¶

https://github.com/slott56/DataSynthTool

(And these slides, too.)
