-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrulegen.py
389 lines (324 loc) · 15.3 KB
/
rulegen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/usr/bin/env python3
"""Generate a random string according to given generation rules."""
# Copyright © 2015 Timothy Pederick.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Standard library imports.
from collections import namedtuple
from contextlib import contextmanager
import csv
import os
import os.path
import random
import sqlite3
import types
# Local imports.
from ruleparser import (all_terminals, parse_rules, parse_terminals, Literal,
DBLookup)
# Teach sqlite3 to round-trip Python booleans: they are written as integers,
# and any column declared BOOLEAN is turned back into a bool on the way out
# (the latter only takes effect on connections opened with
# detect_types=sqlite3.PARSE_DECLTYPES).
def _bool_from_sqlite(raw):
    """Convert the stored representation of a BOOLEAN column to a bool."""
    return bool(int(raw))


sqlite3.register_converter('BOOLEAN', _bool_from_sqlite)
sqlite3.register_adapter(bool, int)
class Rulegen:
    """A rules-based random generator.

    A random generator is initialised from two text files: a CSV file
    containing words or word elements ("roots"), and a rules file
    containing generation rules in the format defined by the ruleparser
    module.

    To generate strings, the generator connects to a SQLite database
    file on disk (creating it from the above files first if it does not
    exist). This database has two tables, called by default "Roots" and
    "Results", corresponding to the CSV and rules files respectively.
    (As a result of this setup, it is possible for a generator to
    operate with only the database file.)

    Class attributes:
        The following attributes set some defaults for the names and
        schemas of the database tables. Override them in a subclass, or
        by assigning instance attributes with the same names, to change
        the default.

        roots_table -- The name of the "Roots" table.
        roots_idcol -- The column of the "Roots" table that holds a
            unique row identifier.
        results_table -- The name of the "Results" table.
        results_idcol -- The column of the "Results" table that holds a
            unique row identifier.
        results_datacol -- The column of the "Results" table that holds
            the data.
        idcol_type -- The SQLite type declaration applicable to the
            above-mentioned *_idcol attributes.

    Instance attributes:
        data_prefix -- A default filename (minus the extension) to use
            for all data files.
        data_dir -- A directory name (absolute or relative) where the
            data files are found by default.
        csvfile, rulefile, dbfile -- The paths and filenames of the CSV,
            rules, and database files.
        rules -- The parse tree for the rules file, as produced by the
            ruleparser module. Read-only.

    NOTE(review): table and column names are spliced into SQL using
    Python's repr() as the quoting mechanism. That relies on SQLite
    accepting quoted strings where an identifier is expected; confirm
    before using names that themselves contain quote characters.
    """
    roots_table, roots_idcol = 'Roots', 'RootID'
    results_table, results_idcol, results_datacol = ('Results', 'ResultID',
                                                     'Result')
    idcol_type = 'INTEGER PRIMARY KEY AUTOINCREMENT'

    def __init__(self, data_prefix, data_dir=None, csvfile=None, rulefile=None,
                 dbfile=None):
        """Initialise the generator.

        Keyword arguments:
            data_prefix -- As the instance attribute.
            data_dir -- As the instance attribute. The default is the
                current working directory.
            csvfile, rulefile, dbfile -- As the instance attributes. The
                defaults are built from the above two parameters, with
                extensions ".csv", ".rules", and ".db", respectively.
        """
        self.data_prefix = data_prefix
        self.data_dir = os.getcwd() if data_dir is None else data_dir

        def path_or_default(explicit, ext):
            # An explicitly supplied path always wins over the derived one.
            if explicit is not None:
                return explicit
            return os.path.join(self.data_dir, self.data_prefix + ext)

        self.csvfile = path_or_default(csvfile, '.csv')
        self.rulefile = path_or_default(rulefile, '.rules')
        self.dbfile = path_or_default(dbfile, '.db')

        # Lazily-populated caches: the parsed rules, the CSV column headings,
        # and the set of row IDs already used during one generate() call.
        self._rules = self._headings = self._seen_ids = None

    @property
    def rules(self):
        """The parse tree for the rules file (parsed on first access)."""
        if self._rules is None:
            self._rules = parse_rules(self.rulefile)
        return self._rules

    def guess_type(self, heading):
        """Guess the SQLite type of a column based on its heading.

        The guessed type is set by the generator's idcol_type attribute
        if the heading is the same as either of the *_idcol attributes.
        If not, but the column heading starts with "Is" followed by a
        capital letter, the guessed type is suitable for Boolean data.
        Otherwise, the guessed type is "TEXT".

        Keyword arguments:
            heading -- A string containing a column heading.

        Returns:
            A string containing a SQLite data type declaration.
        """
        if heading in (self.roots_idcol, self.results_idcol):
            return self.idcol_type
        bool_prefix = 'Is'
        # Slice rather than index, so that a heading that is exactly "Is"
        # (nothing after the prefix) falls through to TEXT instead of raising
        # IndexError.
        after_prefix = heading[len(bool_prefix):len(bool_prefix) + 1]
        if heading.startswith(bool_prefix) and after_prefix.isupper():
            return 'BOOLEAN NOT NULL'
        return 'TEXT'

    def headings(self, with_id=False, with_types=False, sep=', '):
        """Get the data column headings as a string.

        Specifically, these are the columns of the generator's "Roots"
        table. The "Results" table only has one data column, and its
        heading is stored in the generator's results_datacol attribute.

        Keyword arguments:
            with_id -- True if the ID column of the table should be
                included. The default is False.
            with_types -- True if the output should include the data
                type declaration for each column (suitable for e.g. a
                SQL CREATE TABLE statement). The default is False.
            sep -- The separator to use in the output string. The
                default is a comma and space (", ").

        Returns:
            A string containing the column headings, separated by the
            sep argument.
        """
        if self._headings is None:
            # Call the CSV reader so that self._headings is set.
            self.read_csv()
        assert self._headings is not None
        headings = [self.roots_idcol] if with_id else []
        headings.extend(self._headings)
        return sep.join(repr(heading) +
                        (' ' + self.guess_type(heading) if with_types else '')
                        for heading in headings)

    def read_csv(self):
        """Read data from the CSV data file.

        As a side effect, the column headings (the first line of the
        file) are stored for later use by headings().

        Returns:
            A list of named-tuple instances, one per data line.
        """
        # newline='' lets the csv module do its own newline handling, as its
        # documentation requires (matters for quoted fields with newlines).
        with open(self.csvfile, encoding='utf-8', newline='') as file:
            reader = csv.reader(file)
            # Fetch the column headings from the first line. Don't include it
            # in the output!
            self._headings = next(reader)
            # Stick each remaining line in a named tuple.
            csv_format = namedtuple('csv_format', self._headings)
            # Lazy evaluation of a generator causes problems when other
            # methods need to use self._headings, so don't be a generator.
            # Just return the results instead of yielding each one.
            return [csv_format(*row) for row in reader]

    def build_db(self):
        """(Re)build the SQLite database.

        Both tables are dropped (if present) and recreated from the CSV
        and rules files.
        """
        # Load both inputs before touching the database, so that a missing or
        # unreadable CSV/rules file does not leave a stray, empty database
        # file behind that later calls would mistake for a built one.
        csv_rows = self.read_csv()
        # Stick each terminal sequence in a one-item tuple to stop the string
        # being interpreted as a sequence of data values.
        result_rows = [(result,) for result in all_terminals(self.rules)]

        # Connect to the database file.
        conn = sqlite3.connect(self.dbfile,
                               detect_types=sqlite3.PARSE_DECLTYPES)
        try:
            cur = conn.cursor()
            # Create or replace the tables.
            cur.execute('DROP TABLE IF EXISTS {!r}'.format(self.roots_table))
            cur.execute('DROP TABLE IF EXISTS {!r}'.format(self.results_table))
            cur.execute('CREATE TABLE {!r} '
                        '({})'.format(self.roots_table,
                                      self.headings(with_id=True,
                                                    with_types=True)))
            cur.execute('CREATE TABLE {!r}'
                        ' ({!r} {}'
                        ', {!r} TEXT)'.format(self.results_table,
                                              self.results_idcol,
                                              self.idcol_type,
                                              self.results_datacol))
            # Insert the CSV data into the roots table.
            cur.executemany('INSERT INTO {!r} ({})'
                            ' VALUES ({})'.format(self.roots_table,
                                                  self.headings(),
                                                  ', '.join('?' for _ in
                                                            self._headings)),
                            csv_rows)
            # Insert each result format into the results table.
            cur.executemany('INSERT INTO {!r} ({})'
                            ' VALUES (?)'.format(self.results_table,
                                                 self.results_datacol),
                            result_rows)
            conn.commit()
        finally:
            conn.close()

    def get_data(self, colname, table=None, idcol=None):
        """Get one random value from the database.

        Internally, the generator's _seen_ids attribute is used to
        ensure that successive calls to get_data() do not repeat rows.
        This private attribute is set (and reset) by the generate()
        method.

        Keyword arguments:
            colname -- The database column name from which data is to be
                retrieved.
            table -- The name of the table (in this generator's database
                file) from which data is to be retrieved. The default is
                the generator's roots_table attribute.
            idcol -- The database column name that stores a unique
                identifier for each row. The default is the generator's
                roots_idcol attribute (or results_idcol, if the table
                argument is supplied and is equal to the results_table
                attribute).

        Returns:
            A string.

        Raises:
            LookupError -- No row with a non-empty value in the
                requested column is left to choose from.
        """
        if not os.path.isfile(self.dbfile):
            self.build_db()
        if table is None:
            table = self.roots_table
        if idcol is None:
            idcol = (self.results_idcol if table == self.results_table else
                     self.roots_idcol)

        # When rows are being tracked, also fetch the row ID and exclude every
        # ID that has already been handed out.
        tracking = self._seen_ids is not None
        values = []
        select_addendum = where_addenda = ''
        if tracking:
            select_addendum = ', t.{!r}'.format(idcol)
            values = list(self._seen_ids)
            where_addenda = ' AND t.{!r} != ?'.format(idcol) * len(values)
        # The empty string is written as '' (the standard SQL string literal);
        # a double-quoted "" is only treated as a string by SQLite as a
        # legacy fallback.
        query = ('SELECT t.{1!r}{2}'
                 ' FROM {0!r} t'
                 ' WHERE t.{1!r} IS NOT NULL'
                 " AND t.{1!r} != ''{3}"
                 ' ORDER BY random()'
                 ' LIMIT 1'.format(table, colname,
                                   select_addendum, where_addenda))

        conn = sqlite3.connect(self.dbfile,
                               detect_types=sqlite3.PARSE_DECLTYPES)
        try:
            cur = conn.cursor()
            cur.execute(query, values)
            row = cur.fetchone()
        finally:
            # We're finished with the connection... but don't commit, because
            # nothing (should have) changed!
            conn.close()

        if row is None:
            raise LookupError('no unused, non-empty value left in column {!r}'
                              ' of table {!r}'.format(colname, table))
        if tracking:
            self._seen_ids.add(row[1])
        return row[0]

    def generate(self):
        """Generate a random string according to the generator rules.

        Returns:
            A string.
        """
        # Select a random output format. Don't save seen data rows, yet.
        self._seen_ids = None
        fmt = self.get_data(self.results_datacol, self.results_table,
                            self.results_idcol)
        # This is a list of 2-tuples. The first element of each is the text to
        # add to the string; the second is None when the text came from a
        # string literal, or the name of the column when it came from a
        # database lookup.
        result = []
        # Start saving seen data rows.
        self._seen_ids = set()
        # Split the format into a sequence of database lookups and string
        # literals.
        for token in parse_terminals(fmt):
            # Push string literals straight to output, but grab a random
            # element from the database for each database lookup.
            if isinstance(token, Literal):
                if token.content != '':
                    result.append((token.content, None))
            else:
                assert isinstance(token, DBLookup)
                result.append((self.get_data(token.content), token.content))
        # Apply any post-processing.
        self.postprocess(result)
        return ''.join(text for text, _ in result)

    def postprocess(self, result):
        """Apply generator-specific processing to generated output.

        The base implementation does nothing; it exists as a hook to be
        overridden.

        Keyword arguments:
            result -- A list (or other mutable sequence) of 2-tuples.
                Each tuple contains a string and a column name (or None
                if the string came from a literal).

        Returns:
            None (the result argument is modified in-place).
        """
        return
# Ready-made generators shipped with this module.
academia = Rulegen(data_prefix='academia', data_dir='Academia')
def _academia_postprocess(self, result):
    """Delete doubled letters and drop O suffixes where necessary.

    Two adjustments are made to each piece of text, in order:

    1. If its first letter equals the last letter of the preceding
       (already adjusted) piece, that first letter is removed.
    2. If it ends in "o" and the following piece begins with a lowercase
       vowel, the trailing "o" is removed.

    Keyword arguments:
        result -- A list (or other mutable sequence) of 2-tuples, each
            holding a string and a column name (or None).

    Returns:
        None (the result argument is modified in-place).
    """
    if not os.path.isfile(self.dbfile):
        self.build_db()
    previous_end = None
    for n, (text, colname) in enumerate(result):
        untouched = text
        # Does this string duplicate the last letter of the previous?
        # (Guarded so that an empty piece cannot raise IndexError.)
        if text and text[0] == previous_end:
            text = text[1:]
        # Does the next string make this one drop an O?
        if text.endswith('o') and n + 1 < len(result):
            next_text, _ = result[n + 1]
            # An explicit emptiness check is needed: '' in 'aeiou' is True.
            if next_text and next_text[0] in 'aeiou':
                text = text[:-1]
        # Were any changes made?
        if text != untouched:
            result[n] = (text, colname)
        # A piece that was trimmed away entirely contributes no letters, so
        # the last visible letter is still the one remembered earlier.
        if text:
            previous_end = text[-1]
# Attach the Academia-specific post-processor to the academia instance only,
# as a bound method, leaving the class-level hook untouched.
academia.postprocess = _academia_postprocess.__get__(academia, Rulegen)
technobabble = Rulegen(data_prefix='technobabble', data_dir='Technobabble')