#!/LibidoMechanica/bin/python3
# -*- coding: utf-8 -*-
# cython: language_level=3
"""generate.py creates the content at LibidoMechanica.tumblr.com, which is a
blog consisting of automatically written "love poetry" created by this script.
This program is copyright 2017-19 by Patrick Mooney.
Usage:

    ./generate.py [options]

Options:
    --help, -h      Print this help text, then exit.
    --build, -b     Fully populate the textual similarity cache, then exit.
    --clean, -c     Clean out stale data from the textual similarity cache,
                    then exit.
Only one of the above options may be specified; if one is, the specified task
is performed, then the program quits. If no options are given on the command
line, the program writes and posts a poem, then quits.
In a nutshell, this program "writes" these "love poems" by training a Markov
chain-based text generator on a set of existing love poems (a phrase sometimes
rather broadly interpreted) picked from a larger corpus of love and romance
poetry in English. This corpus is itself a work in progress and has known
problems: as of this writing (1 July 2018), for instance, it disproportionately
emphasizes canonical British poets from the 15th to the 18th centuries and
under-represents poems that (for instance) were written in languages other than
English; were written by colonial subjects; were written by working-class
writers; etc. etc. etc. SOME attention (though not enough) has been paid to
such representational matters, and diversifying the corpus of training texts in
many ways is a long-term goal for this project.
The text generator used is a modification of my own Markov chain-based text
generator, forked from Harry R. Schwartz's text generator and heavily modified.
This particular version of the text generator treats individual characters as
the tokens that are mapped, rather than whole words, as my other text-
generation projects do. The generator is trained on a variable series of texts
selected from the corpus, each of which bears SOME computable "similarity" to
another text already selected -- either a seed text from the beginning or
another text previously selected in the same manner. The details and thresholds
for the selection algorithm are currently (25 July 2018) being tweaked quite
regularly.
Once the poem is generated by the text generator, it is modified in various
ways to make it look less like it was generated by an algorithm that was merely
traversing a series of Markov chains randomly. Some of these tasks are:
* attempting to balance opening and closing punctuation (#FIXME: not good enough yet.)
* curlifying quotes
* preventing sentence-ending punctuation from beginning a line
* stripping spurious line breaks
* attempting to normalize (according to any of several definitions) stanza
length.
* attempting to regularize the number of syllables in a line, according to
any of several methodologies.
That was not a comprehensive list.
Once the poem is cleaned and otherwise "edited," it is posted to Tumblr, and an
archival copy is posted to the archives/ folder inside the project's folder.
This "archival copy" is a bzipped JSON file recording the poem itself, the
texts used to train the text generator that generated it, and some other info
about the poem's generation. The archives/ folder is periodically cleaned by
hand into a series of folders each containing only 1000 archived poems.
The training-text selection algorithm evaluates the "similarity" between
existing members of the list and candidates for addition to it. This is a
fairly expensive calculation to make, especially when at least one of the texts
being compared is long, and so the results of the calculation are cached
between runs in a global similarity cache. This cache is opened, used and
modified, and then updated on disk when the source text selections have been
made. There are several BasicSimilarityCache classes, though only one is actually
used by the current setup: older ones still exist in case any cache files
created by them need to be read. All are singleton classes (and more recent
ones make some attempt to enforce this, or at least to protect against
erroneous spurious creations). Creating one automatically reads the cache
into memory as an attribute of the object being instantiated. Normally, this is
probably best done with the convenience wrapper open_cache(), which is a
context manager that ensures the cache is written back to disk when it is done
being used (and has quite likely been modified). The convenience function
clean_cache() cleans stale data out of the cache; the convenience function
build_cache() forces it to be fully populated with results for all texts in the
training corpus (and takes a REALLY LONG TIME to run if the cache has not
already been populated).
This whole script is very much a rough draft and a work in progress. Many of
the sub-tasks that this script accomplishes are accomplished in hacky and
suboptimal ways. There's plenty of room for improvement here.
THIS SOFTWARE IS OFFERED WITHOUT WARRANTY OF ANY KIND AT ALL. It is ALPHA
SOFTWARE; if you don't know what that means, or can't read the source code to
determine whether it meets your needs, then this software is not intended for
you.
Nevertheless, if you want to adapt it, this script is licensed under the GNU
GPL, either version 3, or (at your option) any later version; see the file
LICENSE.md for more details.
"""
# Current list of things so annoying that they're likely to get priority in fixing:
# * Deepening and diversifying the corpus is always a goal.
# * Something is occasionally truncating poems early. --FIXED?
# * We should syllabify the entire source corpus, keeping a list of which words are manually syllabified, then
# check to see if they're syllabified correctly, and keep a second dictionary to use in addition to the CMU
# corpus.
# * There are other ideas for how to judge source-text similarity:
# * (approximate) year of composition
# * geographical nearness
# * various author characteristics
# * just picking multiple poems from a single author would be a good move with authors who have a sufficient
# number of poems in the corpus.
# * all of these would require manual metadata entry. Oh boy, another pickled dictionary or something.
# * Tokenizing currently drops leading space, which shouldn't happen, actually. -- NO LONGER CONVINCED THIS IS TRUE
# * We should have CAPITALIZATION NORMALIZATION goin' on. In the output, I mean.
# * Also other formal things: patterns of leading space, e.g.
# * When a poem title needs to be shortened, the current algorithm simply lops off a random number of tokens until the
# phrase is short enough. This works sometimes, but also produces titles that, say, end with conjunctions an
# unpleasant amount of the time. It would be smarter to generate a parse tree for the relevant sentence,
# then grab an appropriate-length branch (or branches) from it.
# * There is of course always parameter tweaking. Documentation improvements, too.
# * We should try harder to avoid producing poems with a prime number of lines.
# * Come right down to it, we should also try harder to avoid producing poems with a PRIME-LIKE number of syllables.
# (By which I mean: no USEFUL factors in the number. It's surprising how many poems are generated with a total
# number of syllables that has no factors that are plausible poetic line lengths.)
# * The directory structure needs reworking, and this module needs to be split into smaller files. --WORKING
# * This probably implies a utils/ folder for secondary scripts, like check_corpus.py.
# * There will almost certainly be others.
# * There's still trouble with intra-word apostrophes.
# * Same deal with leading apostrophes in archaic contractions with an initial dropped-letters apostrophe ("'tis")
# * Probably best dealt with by preprocessing and using something apostrophe-like in the source texts.
# * "Is it an opening or a closing quote?" gets it wrong when:
# * there is a previous non-alphabetic (or -alphanumeric?) character:
# * e.g., em dash-apostrophe-capital letter should turn the apostrophe into an opening single quote, but gets it wrong
import bz2
import collections
import copy
import datetime
import functools
import json
import numbers
from pathlib import Path
import pprint
import random
import re
import string
import subprocess
import sys
import typing
import unicodedata
import pid # https://pypi.python.org/pypi/pid/
from num2words import num2words # https://pypi.org/project/num2words/
from nltk.corpus import cmudict # nltk.org
import pyximport; pyximport.install() # http://cython.org
import patrick_logger # https://github.com/patrick-brian-mooney/personal-library
from patrick_logger import log_it
import file_utils as fu # https://github.com/patrick-brian-mooney/python-personal-library/
import social_media # https://github.com/patrick-brian-mooney/personal-library
from social_media_auth import libidomechanica_client # Unshared file that contains authentication tokens.
import text_generator as tg # https://github.com/patrick-brian-mooney/markov-sentence-generator
import poetry_generator as pg # https://github.com/patrick-brian-mooney/markov-sentence-generator
import text_handling as th # https://github.com/patrick-brian-mooney/personal-library
import check_capitalization as cc # https://github.com/patrick-brian-mooney/python-personal-library
from globs import * # Filesystem structure, etc.
import similarity_cache as sc # Cache of calculated textual similarities.
patrick_logger.verbosity_level = 3
manually_check_capitalization = False
base_dir = Path(__file__).parent
cache_dir = base_dir / 'cache'
syllables_cache = cache_dir / 'syllables.json'
archive_dir = base_dir / 'archives'
max_archives_in_sub_dir = 1000
# Load the supplemental syllabification data.
# FIXME: this supplemental data is never saved!
try:
more_syllable_data = json.loads(syllables_cache.read_text(encoding='utf-8'))
except (json.JSONDecodeError, IOError):
more_syllable_data = {
'corrected': dict(),
'validated': dict(),
'computed': dict(),
}
more_syllable_data['all'] = collections.ChainMap(more_syllable_data['corrected'], more_syllable_data['validated'],
more_syllable_data['computed'])
unused_lines = collections.deque()
# Some language-related constants.
punct_with_no_space_before = """.?,;!:#-‐‑‒–—―%&)*+/@]^_}""".strip()
punct_with_no_space_after = """-‐‑‒–—―&(+/<=>@[_`{~""".strip()
opening_bracketers = ( "‘", '“', '(', '[', '{', )
closing_bracketers = ( "’", '”', ')', ']', '}', )
English_straight_single_quote = "'"
English_straight_double_quote = '"'
prohibs = [
'@', # email addresses are hard to syllabify & look ugly in poetry.
'_', # reject underlines, mostly because they are often usernames
'http://', 'https://', 'www', # URLs occasionally appear, but let's reject them too.
''.join([chr(i) for i in [99, 117, 110, 116]]), # the C word
''.join([chr(i) for i in [116, 119, 97, 116]]), # another slur for women, with a similar meaning
''.join([chr(i) for i in [102, 97, 103]]), # the 3-letter F-word
]
phoneme_dict = dict(cmudict.entries())
NEWLINE = "\n"
def get_blank_post_data() -> typing.Dict:
return {
'tags': ['poetry', 'automatically generated text', 'Patrick Mooney', 'Markov chains'],
'normalization_strategy': None,
        'syllabic_normalization_strategy': None,
        'stanza length': None,
}
# This next is a global dictionary holding data to be archived at the end of the run. Modified constantly.
post_data = get_blank_post_data()
# Data about poetic form
poem_defaults = {
'max stanzas': 20, # FIXME: for now.
'min stanzas': 1,
'indent pattern': [0,]
}
poem_forms = {
'ballad':
[
{ # pick one of these numbers: how long will a stanza be, in lines?
'stanza length': [4, 4, 4, 4, 4, 4, 4, 4, 6, 8, 8,],
# possible numbers of syllables in a foot:
'syllables in foot': [2, 2, 2, 2, 2, 2, 2, 3],
# METRICAL FEET, not syllables, per line.
# [4, 3] means a 4-foot line is followed by a 3-foot line, & this pattern repeats through the stanza.
'meter pattern': [4, 3],
# Number of spaces before each line in the poem.
# Cycles when exhausted. Need not match up with stanza length.
'indent pattern': [0, 4],
'form name': 'ballad',
},
],
'sonnet':
[
{'stanza length': [14,],
'syllables in foot': [2, 2, 2, 2, 2, 2, 3,], # iambic pentameter is most common form
# next: allow occasional tetrameter sonnet, tho most times the program runs "sonnet" means "pentameter".
'meter pattern': [ random.choice([4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ]) ],
'max stanzas': 1,
'form name': 'sonnet',
},
],
'Meredith sonnet':
[
{'stanza length': [16,],
'syllables in foot': [2,],
'meter pattern': [5,],
'max stanzas': 1,
'indent pattern': [0, 2, 2, 0,],
'form name': 'Meredith sonnet',
}
],
'limerick':
[
{'stanza length': [5,],
'syllables in foot': [3,], # Anapestic trimeter, trimeter, dimeter, ..
'meter pattern': [3, 3, 2, 2, 3], # ... dimeter, trimeter.
'max stanzas': 1,
'indent pattern': [0, 0, 3, 3, 0,],
'form name': 'limerick',
},
],
'cinquain':
[
{'stanza length': [5,],
'syllables in foot': [1,],
'meter pattern': [2, 4, 6, 8, 2],
'max stanzas': 1,
'form name': 'cinquain',
},
],
'curtal sonnet':
[
{'stanza length': [11,],
'syllables in foot': [2,], # Iambic pentameter
'meter pattern': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1], # Last line is actually a single spondee, but we don't yet have a way to represent meter.
'max stanzas': 1,
'form name': 'curtal sonnet',
},
],
'kimo':
[
{'stanza length': [3,],
'syllables in foot': [1,],
'meter pattern': [10, 7, 6],
'max stanzas': 1,
'form name': 'kimo',
}
],
'Kelly lune':
[
{'stanza length': [3,],
'syllables in foot': [1,],
'meter pattern': [5, 3, 5],
'max stanzas': 1,
'form name': 'Kelly lune',
},
],
'rispetto':
[
{'stanza length': [4,], # version 1: 2 iambic tetrameter quatrains
'syllables in foot': [2,],
'meter pattern': [4,],
'min stanzas': 2,
'max stanzas': 2,
'form name': 'rispetto',
},
{'stanza length': [8,], # version 2: 1 stanza, 8 hendecasyllabic lines
'syllables in foot': [1,],
'meter pattern': [11,],
'max stanzas': 1,
'form name': 'rispetto',
},
],
'tanka':
[
{'stanza length': [5,],
'syllables in foot': [1,],
'meter pattern': [5, 7, 5, 7, 7,],
'max stanzas': 1,
'form name': 'tanka',
},
],
'treochair':
[
{'stanza length': [3,],
'syllables in foot': [1,],
'meter pattern': [3, 7, 7],
'form name': 'treochair',
},
],
'tricube':
[
{'stanza length': [3,],
'syllables in foot': [1,],
'meter pattern': [3,],
'max stanzas': 3,
'min stanzas': 3,
'form name': 'tricube',
},
],
}
# Now perform a crude adjustment to the frequency with which different forms are selected for writing.
# We do this by inflating the number of keys that lead to a particular structure record, so for instance the sonnet
# form is referenced again via the key names sonnet1, sonnet2, ... sonnetN. This works because the form is picked
# at random from all keys.
for (form_name, multiplier) in (
('ballad', 10),
('sonnet', 15),
('Meredith sonnet', 3),
('limerick', 4),
('curtal sonnet', 3),
('rispetto', 2)):
for i in range(1, 1+multiplier):
poem_forms[f"{form_name}{i}"] = poem_forms[form_name]
# Next, some general utility functions.
def print_usage(exit_code: int = 0) -> None:
"""Print the docstring as a usage message to stdout, then quit with status code
EXIT_CODE.
"""
log_it("INFO: print_usage() was called", 4)
print(__doc__)
sys.exit(exit_code)
def get_last_git_commit_id() -> str:
"""Return the commit ID for the most recent Git commit.
#FIXME! This is kind of hacky.
"""
cmd = """git log -1 | grep ^commit | cut -d " " -f 2"""
return subprocess.run(cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, universal_newlines=True).stdout.strip()
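# (Note: `git rev-parse HEAD` would produce the same commit ID without the grep/cut pipeline.)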
def get_last_script_change_date() -> str:
"""Return the date and time this script -- this one, that you are reading right now
-- was last modified.
"""
epoch_time = Path(__file__).stat().st_mtime
return datetime.datetime.fromtimestamp(epoch_time).isoformat(" ")
def get_script_version_info() -> typing.Dict[str, typing.Dict[str, str]]:
"""Retuns a dictionary with information about the current script version.
"""
return {
'LibidoMechanica script version': {
'generate.py commit ID': get_last_git_commit_id(),
'generate.py last modification time': get_last_script_change_date(),
}
}
# Some numeric utilities.
@functools.lru_cache(maxsize=256)
def eratosthenes_sieve(upper_limit: int = 101) -> typing.List[int]:
"""Generate a list of primes greater than zero but no larger than UPPER_LIMIT,
using the Sieve of Eratosthenes.
"""
ret = set(range(2, 1 + upper_limit))
for i in range(2, 1 + upper_limit):
if i in ret: # If I is currently in ret, we've hit a prime, i.e. a number that hasn't yet been discarded.
            for j in range(2, 1 + upper_limit//i):  # Start at J=2: when J=1, I*J is I itself; don't discard it.
                ret.discard(j * i)                   # Discard all multiples of I less than or equal to UPPER_LIMIT.
return sorted(ret)
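# For reference: eratosthenes_sieve(20) == [2, 3, 5, 7, 11, 13, 17, 19].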
@functools.lru_cache(maxsize=128)
def might_be_an_int(what: typing.Any) -> bool:
"""Return True if WHAT can be coerced to an integer, or False otherwise.
    Note that it returns the boolean True, not the coerced integer itself.
"""
try:
_ = int(what)
return True
except (TypeError, ValueError,):
return False
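# e.g., might_be_an_int("42") is True, but might_be_an_int("4.2") is False: int() won't coerce the latter.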
# Sequence-related utilities.
def _flatten_list(l: typing.Iterable) -> typing.Generator[object, None, None]:
"""Regardless of how deep the list L is, return a list that has the non-list atoms
that compose the list L. If L contains any lists, the returned list will contain
the ELEMENTS of those sublists, rather than the sublists themselves. No matter
how deeply nested L is, the returned list will not contain any lists, but only
the atoms of those lists.
Note that this actually returns a generator expression, not a list, and so
using the non-underscore convenience wrapper below might be a good idea
sometimes.
"""
assert isinstance(l, collections.abc.Iterable)
for elem in l:
if isinstance(elem, collections.abc.Iterable) and not isinstance(elem, (str, bytes)):
for sub in _flatten_list(elem):
yield sub
else:
yield elem
def flatten_list(l: typing.Iterable) -> typing.List[object]:
"""Convenience wrapper for _flatten_list(), above. Returns an actual list, which
is guaranteed to contain no other lists.
"""
return list(_flatten_list(l))
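# e.g., flatten_list([1, [2, [3, [4]]], 'five']) == [1, 2, 3, 4, 'five']; strings survive as atoms.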
def alternating_sequence(min_num: int,
max_num: int) -> typing.Sequence[int]:
"""Takes the sequence of integers
[ MIN, MIN + 1, MIN + 2 ... MAX - 2, MAX - 1, MAX ]
and reorders it to
[ MIN, MAX, MIN + 1, MAX - 1, MIN + 2, MAX - 2, ... ]
Note that MAX is the actual highest member in the range, not the integer past
the highest member. This is not a half-open interval.
"""
working, ret, first = list(range(min_num, max_num + 1)), list(), True
while working:
ret.append(working.pop(0 if (first) else -1))
first = not first
return ret
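# e.g., alternating_sequence(1, 5) == [1, 5, 2, 4, 3].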
@functools.lru_cache(maxsize=1024)
def bin_fit(options: typing.Iterable[numbers.Number],
goal: typing.Union[int, float]) -> typing.Union[typing.List[numbers.Number], None]:
"""Given OPTIONS, a list of numeric values, tries to find a combination of values
that add up to GOAL. If it finds such a list, returns it. Otherwise, returns
None.
Returns the first solution found, which means in practice that it prefers to
find a list containing a few large numbers rather than many small ones. This
particular choice was made specifically because it's best to compose a poem from
fewer comparatively long sentences than to compose a poem from many very short
ones.
Calls itself recursively, but this is not likely to be a problem unless we start
dealing with much longer lists than tests have so far managed to generate. This
function is mostly (only?) called while trying to decide whether the list of the
numbers of syllables in the cache of unused lines can be used to fill up the
remaining syllables needed in a particular poem, and so far this list has not
grown large enough to produce deeper recursion in this function.
When recursing, we do so only from a loop iterating over remaining options from
greatest to least, and only by allowing the sub-problem to work on lists of
numbers smaller than or equal to the number we're currently considering. This
helps to keep the problem scope manageable. In any case, ordering the numbers
and reducing the problem this way works because sum([a, b, c]) is the same as
sum([c, a, b]) or other permutations, because addition is commutative. So order
doesn't matter, and we can pick an order that narrows the problem scope in this
way.
"""
if not options:
if goal == 0:
return options
else:
return None
if sum(options) == goal:
return options
else:
current_opts = sorted([i for i in options if i <= goal], reverse=True)
for i, current in enumerate(current_opts):
            if 1 + i == len(current_opts):  # At the end of the list? No solution here.
return None
if current == goal:
return [current]
else:
smaller_bin = bin_fit(tuple(current_opts[1 + i:]), (goal - current))
if smaller_bin:
return [current] + list(smaller_bin)
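# e.g., bin_fit((8, 6, 4, 2), 10) == [6, 4], while bin_fit((5, 3), 4) is None. Note that OPTIONS
# must be passed as a tuple, since functools.lru_cache requires hashable arguments.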
# Some text-related utilities.
@functools.lru_cache(maxsize=None)
def manually_count_syllables(word: str) -> int:
"""Clearly not perfect, but better than nothing.
#FIXME: we should be keeping an additional list for words not in cmudict.
Based on https://datascience.stackexchange.com/a/24865.
"""
count = 0
vowels = 'aeæiouy'
word = unicodedata.normalize('NFKD', word.lower()).encode('ASCII', 'ignore').decode('ASCII') # strip diacritics
if len(word) == 0: return 0 # Naively assume that null words have no syllables.
if word[0] in vowels:
count +=1
for index in range(1, len(word)):
if word[index] in vowels and word[index-1] not in vowels:
count +=1
if word.endswith('e'):
count -= 1
if word.endswith('le'):
count += 1
if count == 0:
count += 1
return count
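# e.g., manually_count_syllables("love") == 1 and manually_count_syllables("table") == 2, thanks
# to the special-casing of final -e and -le above.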
def int_to_roman(input: int) -> str:
""" Convert an integer to a Roman numeral. This was blatantly stolen (with small
adaptations) from the O'Reilly Python Cookbook:
https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s24.html.
"""
if not isinstance(input, type(1)):
raise TypeError("expected integer, got %s" % type(input))
if not 0 < input < 4000:
raise ValueError("Argument must be between 1 and 3999")
ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
result = []
for i in range(len(ints)):
count = int(input / ints[i])
result.append(nums[i] * count)
input -= ints[i] * count
return ''.join(result)
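# e.g., int_to_roman(1999) == 'MCMXCIX'.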
def ordinal_description(number: int,
capitalize_first: bool = False,
negative_indicator: str = "negative") -> str:
"""Generates an English ordinal description of NUMBER. For instance, if NUMBER is
3, this function returns "third". If CAPITALIZE_FIRST is True, capitalizes the
first letter. If NEGATIVE_INDICATOR is specified, it is prepended to negative
numbers to indicate their negativity. (Arguably, allowing negative ordinals is
a violation of the mathematical notion of ordinality, but practically, it's
sometimes useful.)
"""
assert isinstance(number, int)
ret = num2words(number if (number >= 0) else -number, to='ordinal').strip()
if number < 0:
ret = f"{negative_indicator.strip()} {ret}"
if capitalize_first:
ret = ret[0].upper() + ret[1:]
return ret
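# e.g., ordinal_description(3) == 'third'; ordinal_description(-2, capitalize_first=True) == 'Negative second'.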
def unicode_of(what: typing.Union[str, bytes]) -> str:
"""Just force WHAT to be a Unicode string, as much as we can possibly automate
that. We start by using the system default, then trying UTF-8, and then, if
either or both is installed, tries UnicodeDammit and chardet. If all else fails,
decodes to Latin-1, which should always not fail although it may munge data. If
even *that* doesn't work or some unknown godawful reason, the error will
propagate upwards.
This comes in handy when interacting with external programs, which may just barf
up data without caring about encoding.
"""
try:
what = what.decode()
except Exception:
try:
what = what.decode('utf-8')
except Exception:
try:
from bs4 import UnicodeDammit # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
return UnicodeDammit(what).unicode_markup
except Exception:
try:
import chardet
char_info = chardet.detect(what)
what = what.decode(char_info['encoding'])
except Exception:
what = what.decode('latin-1')
return what
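# e.g., unicode_of(b'caf\xc3\xa9') == 'café': the very first attempt succeeds, since
# bytes.decode() defaults to UTF-8 under Python 3.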
def strip_invalid_chars(the_poem: str) -> str:
"""Some characters appear in the training texts but are characters that, I am
declaring by fiat, should not make it into the final generated poems at all.
    The underscore is a good example of a character in this class. This function
takes an entire poem as input (THE_POEM) and returns a poem entirely
stripped of all such characters.
"""
log_it("INFO: stripping invalid characters from the text", 2)
invalids = ['_', '*']
    return ''.join([s for s in the_poem if s not in invalids])
@functools.lru_cache(maxsize=8192)
def strip_diacritics(the_input: str) -> str:
"""Strip diacritical marks and convert to ASCII text by decomposing the Unicode
string passed in, then removing non-spacing marks.
"""
return ''.join(the_character for the_character in unicodedata.normalize('NFKD', the_input) if unicodedata.category(the_character) != 'Mn')
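# e.g., strip_diacritics("Brontë") == 'Bronte'.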
@functools.lru_cache(maxsize=2048)
def normalize_text(the_text: str,
lowercase: bool = False) -> str:
"""Get the "comparison" or "normalized" form of a piece of text. This strips off all
unicode accents and, if LOWERCASE is True, also casefolds the text.
"""
if lowercase:
return strip_diacritics(the_text).casefold()
else:
return strip_diacritics(the_text)
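# e.g., normalize_text("Façade", lowercase=True) == 'facade'.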
def supplemental_syllable_count(word: str) -> int:
"""Get the manual count of syllables for words not in CMUDict. If the syllables in
this particular word have never before been counted, then we defer to
manually_count_syllables(), above, to do so, then cache the resulting
calculation as a provisional, to-be-validated-by-a-human number, using it for
now but keeping it on the provisional list (that's
caches.more_syllable_data['computed']). When validated by a human, it moves to
caches.more_syllable_data['validated']. Both are accessible in a ChainMap, so
    either place is fine, though of course reading the ChainMap gives 'validated'
values before 'computed' values if there is a conflict.
If the number of syllables in this word HAS been computed before, then return
that number, whether it's ('just') 'computed' or it's 'validated.'
    Syllable counts can be moved from 'computed' to 'validated' using the function
cache.validate_syllables(), which can be called by running caches.py with the
appropriate switch. See its documentation for details.
"""
w = normalize_text(word, True)
if w in more_syllable_data['all']:
return more_syllable_data['all'][w]
else:
more_syllable_data['computed'][w] = manually_count_syllables(w)
return more_syllable_data['all'][w]
@functools.lru_cache(maxsize=4096)
def syllables_in_word(word: str) -> int:
"""Do a reasonably good job of determining the number of syllables in WORD, a word
in English. Uses the CMU corpus if it contains the word, or a best-guess
approach otherwise. Based on https://stackoverflow.com/a/4103234.
"""
assert isinstance(word, str)
w = normalize_text(''.join([c for c in word if c.isalpha()]), True)
try:
return len([ph for ph in phoneme_dict[w] if ph.strip(string.ascii_letters)])
except (KeyError,):
return supplemental_syllable_count(w)
# FIXME: We need to save computed syllabic data for human verification!
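# e.g., syllables_in_word("love") == 1: CMUdict lists it as L AH1 V, and only the vowel phoneme
# (which carries a stress digit) survives the string.ascii_letters strip above.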
@functools.lru_cache(maxsize=1024)
def syllables_in_sentence(sentence: str,
                          genny: tg.TextGenerator) -> int:
"""Get the total number of syllables in a sentence. Requires a trained GENNY to
split the sentence into tokens.
"""
ret = 0
for token in genny._token_list(sentence, character_tokens=False):
ret += syllables_in_word(token)
return ret
def appropriate_quote(quote_list: typing.Iterable[str],
quote_level: int,
standard_english_quotes: bool = False) -> str:
"""Returns the appropriate quotation mark from QUOTE_LIST, which is a list of
(opening or closing) quotation marks. The quotation mark is chosen to be the
correct mark to open or close a quote at QUOTE_LEVEL levels of nesting. If
STANDARD_ENGLISH_QUOTES is False (the default), then American-style rules for
quotes are used (double, then single, then double, then single, then double,
then single ...); otherwise, Standard English rules (single, double, single ...)
are used.
This function makes a number of assumptions about the structure of QUOTE_LIST:
* it must contain at least two strings.
* the first string must be a single quote.
* the second string must be a double quote.
* any other strings in the list are ignored and not used at all by this
function, though they may be useful to other bits of code in this
project.
Here's a quick summary of the first few levels of nesting, for reference, and
assuming that the lists passed are the global constants open_quotes and
close_quotes:
                   Standard English        American English
        level      opening   closing      opening   closing
        -----      -------   -------      -------   -------
          1           ‘         ’            “         ”
          2           “         ”            ‘         ’
          3           ‘         ’            “         ”
          4           “         ”            ‘         ’
          5           ‘         ’            “         ”
         etc.
"""
return quote_list[((0 if standard_english_quotes else 1) + quote_level - 1) % 2]
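# e.g., given the docstring's assumptions about the list's first two members, appropriate_quote(open_quotes, 1)
# is '“' under American rules, and appropriate_quote(open_quotes, 1, standard_english_quotes=True) is '‘'.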
def normalize_quotes(the_poem: str,
standard_english_quotes: bool = False) -> str:
"""Takes THE_POEM, a string representing an entire poem, and makes sure that
quotation marks are used "correctly": not only are there the same number of
opening and closing quotation marks, but they are properly nested, with single
and double quotes alternating properly, and opening quotes always preceding
closing quotes in an appropriate fashion.
As with much of the other quote-handling code in this project, assumes that
quotation marks are single characters.
Returns a string, which is the entire modified poem.
#FIXME: currently, STANDARD_ENGLISH_QUOTES is *never* True. Should it be?
"""
ret = ""
quote_depth = 0
for i, c in enumerate(the_poem):
        if c in open_quotes:
            if i < (len(the_poem)-1):       # Special-case 'tis, 'twas, 'gainst, etc.
                context = the_poem[i+1:]
                if any(context.startswith(w) and (len(context) == len(w) or context[len(w)].isspace())
                       for w in words_with_initial_apostrophes):
                    ret += close_quotes[0]  # It's an initial apostrophe, not an opening quote.
                    continue
            quote_depth += 1
            ret += appropriate_quote(open_quotes, quote_depth, standard_english_quotes=standard_english_quotes)
            continue
if c in close_quotes:
if quote_depth < 1: # is there no current quote to close? move along, dropping this quotation mark
continue
if i == 0: # don't open the poem with a closing quote mark. Just move on.
continue
            if i < (len(the_poem) - 1):  # If we've got at least one more character in the poem, make sure this closing quote isn't actually an apostrophe.
                if th._is_alphanumeric_char(the_poem[i-1]) and th._is_alphanumeric_char(the_poem[i+1]):
                    ret += c             # Intra-word apostrophe: keep it, but don't let it close a quote.
                    continue
ret += appropriate_quote(close_quotes, quote_depth, standard_english_quotes=standard_english_quotes)
quote_depth -= 1
continue
else:
ret += c
# before finishing, close any remaining open quotations.
while quote_depth > 0:
ret += appropriate_quote(close_quotes, quote_depth)
quote_depth -= 1
return ret
def balance_punctuation(the_poem: str,
opening_char: str,
closing_char: str) -> str:
"""Makes sure that paired punctuation (smart quotes, parentheses, brackets) in the
poem are 'balanced.' If not, it attempts to correct it.
Returns the entire (possibly modified) poem.
"""
opening, closing = the_poem.count(opening_char), the_poem.count(closing_char)
if closing_char == '’': # Sigh. We have to worry about apostrophes that look like closing single quotes.
        closing -= len(re.findall(r'\w*’\w', the_poem))  # Inside a word? It's an apostrophe. Don't count it.
log_it("INFO: Balancing %s and %s (%d/%d)" % (opening_char, closing_char, opening, closing), 2)
if opening or closing: # Do nothing if there's no instances of either character
if opening != closing: # Do nothing if we already have equal numbers (even if not properly "balanced")
nesting_level = 0 # How many levels deep are we right now in the punctuation we're tracking?
indexed_poem = list(the_poem)
index = 0
while index <= (len(indexed_poem)-1):
char = indexed_poem[index]
next_char = '' if index == len(indexed_poem) -1 else indexed_poem[index + 1]
last_char = '' if index == 0 else indexed_poem[index - 1]
if index == (len(indexed_poem)-1) : # End of the poem?
if nesting_level > 0: # Close any open characters.
indexed_poem += [closing_char]
nesting_level -= 1
index += 1
elif char == opening_char: # Opening character?
                    if index == len(indexed_poem) - 1:  # Last character is an opening character?
indexed_poem.pop(-1) # Just drop it.
else:
nesting_level += 1 # We're one level deeper
index += 1 # Move on to next character
elif char == closing_char: # Closing character?
if (closing_char == '’') and (th.is_alphanumeric(next_char) and th.is_alphanumeric(last_char)):
index += 1 # Skip apostrophes in the middle of words
else:
if nesting_level < 1: # Are we trying to close something that's not open?
indexed_poem.pop(index) # Just drop the spurious close quote
else:
if next_char.isspace(): # Avoid non-quote apostrophes in middle of words.
nesting_level -= 1 # We're one level less deep
index += 1 # Move on to next character
elif nesting_level > 0: # Are we currently in the middle of a bracketed block?
if next_char.isspace(): # Is the next character whitespace?
if random.random() < (0.001 * nesting_level): # Low chance of closing the open bracket
indexed_poem.insert(index, closing_char)
nesting_level -= 1
elif char in ['.', '?', '!'] and next_char.isspace():
if random.random() < (0.05 * nesting_level): # Higher chance of closing the open bracketer
indexed_poem.insert(index, closing_char)
nesting_level -= 1
if random.random() < 0.2: # Force new paragraph break?
indexed_poem.insert(index + 1, '\n')
elif char in known_punctuation and last_char in ['.', '!', '?']:
if random.random() < (0.05 * nesting_level):
indexed_poem.insert(index, closing_char)
nesting_level -= 1
if random.random() < 0.2: # Force new paragraph break?
indexed_poem.insert(index + 1, '\n')
elif char == '\n' and next_char == '\n': # Very high chance of closing on paragraph boundaries
if random.random() < (0.4 * nesting_level):
indexed_poem.insert(index, closing_char)
nesting_level -= 1
elif char == '\n':
if random.random() < (0.1 * nesting_level):
indexed_poem.insert(index, closing_char)
nesting_level -= 1
index += 1
else:
index += 1
the_poem = ''.join(indexed_poem)
log_it(" ... after balancing, there are %d/%d punctuation marks." % (the_poem.count(opening_char), the_poem.count(closing_char)), 3)
return the_poem
def curlify_quotes(the_poem: str,
straight_quote: str,
opening_quote: str,
closing_quote: str) -> str:
"""Goes through THE_POEM (a string) and looks for instances of STRAIGHT_QUOTE
(a single-character string). When it finds these instances, it substitutes
OPENING_QUOTE or CLOSING_QUOTE for them, trying to make good decisions about
which of those substitutions is appropriate.
IMPORTANT CAVEAT: this routine iterates over THE_POEM, making in-place
changes at locations determined via an initial scan. This means that
OPENING_QUOTE and CLOSING_QUOTE **absolutely must** have the same len() as
STRAIGHT_QUOTE, or else weird things will happen. This should not be a
problem with standard English quotes under Python 3.X; but it may fail under
non-Roman scripts, in odd edge cases, or if the function is used to try to
do something other than curlify quotes.
    Returns the entire (possibly modified) poem.
NOT FULLY TESTED, but I'm going to bed.
"""
log_it("INFO: curlify_quotes() called to differentiate %s (%d) into %s and %s" % (
straight_quote, the_poem.count(straight_quote), opening_quote, closing_quote), 2)
assert len(straight_quote) == 1, "Quote characters passed to curlify_quotes() must be one-character strings"
assert len(opening_quote) == 1, "Quote characters passed to curlify_quotes() must be one-character strings"
assert len(closing_quote) == 1, "Quote characters passed to curlify_quotes() must be one-character strings"
index = 0
while index < len(the_poem):
index = the_poem.find(straight_quote, index)
if index == -1:
break # We're done.
if index == 0: # Is it the first character of the poem?
the_poem = opening_quote + the_poem[1:]
        elif index == len(the_poem) - 1:  # Is it the last character of the poem?
the_poem = the_poem[:-1] + closing_quote
elif the_poem[index - 1].isspace() and the_poem[
index + 1].isspace(): # Whitespace on both sides? Replace quote with space.
the_poem = the_poem[:index] + ' ' + the_poem[1 + index:]
elif not the_poem[index - 1].isspace(): # Non-whitespace immediately before quote? It's a closing quote.
the_poem = the_poem[:index] + closing_quote + the_poem[index + 1:]
elif not the_poem[index + 1].isspace(): # Non-whitespace just after quote? It's an opening quote.
the_poem = the_poem[:index] + opening_quote + the_poem[index + 1:]
else: # Quote appears in middle of non-whitespace text ...
if straight_quote == '"':
the_poem = the_poem[:index - 1] + the_poem[index + 1:] # Just strip it out.
elif straight_quote == "'":
the_poem = the_poem[:index - 1] + closing_quote + the_poem[index + 1:] # Make it an apostrophe.
else:
raise NotImplementedError("We don't know how to deal with this quote: %s " % straight_quote)
return the_poem
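# e.g., curlify_quotes('"Hello," she said.', '"', '“', '”') == '“Hello,” she said.'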
def fix_punctuation(the_poem: str) -> str:
"""Cleans up the punctuation in the poem so that it appears to be more
'correct.' Since characters are generated randomly based on a frequency
analysis of which characters are likely to follow the last three to ten
characters, there's no guarantee that (for instance) parentheses or quotes
are balanced, because the generator doesn't pay attention to or understand
larger-scale structures.
THE_POEM is a string, which is the text of the entire poem; the function
returns a new, punctuation-fixed version of the poem passed in.
NOT YET FULLY IMPLEMENTED.
#FIXME: balancing punctuation needs to stop using heuristics and accurately
check structure.
#FIXME: we need to deal with the single/double quotes nesting problem.
"""
log_it("INFO: about to alter punctuation", 2)
the_poem = strip_invalid_chars(the_poem)
the_poem = curlify_quotes(the_poem, "'", "‘", "’")
the_poem = curlify_quotes(the_poem, '"', '“', '”')
the_poem = balance_punctuation(the_poem, "‘", "’")
the_poem = balance_punctuation(the_poem, '“', '”')
the_poem = balance_punctuation(the_poem, '(', ')')
the_poem = normalize_quotes(the_poem)
the_poem = balance_punctuation(the_poem, '[', ']')
return balance_punctuation(the_poem, '{', '}')
@functools.lru_cache(maxsize=4096)
def is_known_word(word: str) -> bool:
"""Returns True if WORD is known to be a word, based on lists of known words, and
False otherwise.
"""
return normalize_text(word, True) in phoneme_dict # FIXME! Check other syllable dictionaries, maybe?
def is_rejectable(sentence: str,
                  genny: tg.TextGenerator) -> bool:
    """Checks to see if SENTENCE needs to be rejected, based on whatever
criteria are necessary to ensure pleasing poems. Requires that GENNY, a
fully-trained TextGenerator or subclass, be passed in, because it uses GENNY to
do word-splitting.
"""