Skip to content

Commit

Permalink
Merge pull request #69 from SupervisedStylometry/random
Browse files Browse the repository at this point in the history
random sampling à la Eder, più o meno
  • Loading branch information
Jean-Baptiste-Camps authored Jul 24, 2024
2 parents f40581f + 45c3bce commit 00f1458
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 12 deletions.
4 changes: 4 additions & 0 deletions load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
default=None, type=int)
parser.add_argument('--samples_random', action='store_true',
help="Should random sampling with replacement be performed instead of continuous sampling (default: false)",
default=False)
parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)",
default=False)
parser.add_argument('--keep_sym', action='store_true',
Expand Down Expand Up @@ -59,6 +62,7 @@
relFreqs=not args.absolute_freqs, format=args.x,
sampling=args.sampling, units=args.sample_units,
size=args.sample_size, step=args.sample_step, max_samples=args.max_samples,
samples_random=args.samples_random,
keep_punct=args.keep_punct, keep_sym=args.keep_sym, identify_lang=args.identify_lang,
embedding=args.embedding, neighbouring_size=args.neighbouring_size
)
Expand Down
8 changes: 5 additions & 3 deletions superstyl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import pandas

def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs=True, format="txt", sampling=False,
units="words", size=3000, step=None, max_samples=None, keep_punct=False, keep_sym=False,
identify_lang=False, embedding=False, neighbouring_size=10):
units="words", size=3000, step=None, max_samples=None, samples_random=False,
keep_punct=False, keep_sym=False, identify_lang=False, embedding=False, neighbouring_size=10):
"""
Main function to load a corpus from a collection of file, and an optional list of features to extract.
:param data_paths: paths to the source files
Expand All @@ -28,6 +28,7 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
:param step: step for sampling with overlap (default is step = size, which means no overlap).
Reduce for overlapping slices
:param max_samples: Maximum number of (randomly selected) samples per author/class (default is all)
:param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
:param keep_punct: whether to keep punctuation and caps (default is False)
:param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False). /!\ does not
actually keep symbols
Expand All @@ -50,7 +51,8 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs

if sampling:
myTexts = pipe.docs_to_samples(data_paths, format=format, units=units, size=size, step=step,
max_samples=max_samples, keep_punct=keep_punct, keep_sym=keep_sym,
max_samples=max_samples, samples_random=samples_random,
keep_punct=keep_punct, keep_sym=keep_sym,
identify_lang = identify_lang
)

Expand Down
36 changes: 27 additions & 9 deletions superstyl/preproc/pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ def normalise(text, keep_punct=False, keep_sym=False):

else:
if keep_punct:
out = re.sub(r"[^\p{L}\p{P}]+", " ", text)
# Keep punctuation (and diacritics for now)
out = re.sub(r"[^\p{L}\p{P}\p{M}]+", " ", text)

else:
#out = re.sub(r"[\W0-9]+", " ", text.lower())
out = re.sub(r"[^\p{L}]+", " ", text.lower())
out = re.sub(r"[^\p{L}\p{M}]+", " ", text.lower())

out = unidecode.unidecode(out)

Expand Down Expand Up @@ -173,18 +174,27 @@ def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_


# Load and split in samples of length -n- a collection of files
def get_samples(path, size, step=None, units="words", format="txt", keep_punct=False, keep_sym=False):
def get_samples(path, size, step=None, samples_random=False, max_samples=10,
units="words", format="txt", keep_punct=False, keep_sym=False):
"""
Take samples of n words or verses from a document, and then parse it.
ONLY IMPLEMENTED FOR NOW: XML/TEI, TXT and verses or words as units
:param path : path to file
:param size: sample size
    :param step: size of the step when sampling successively (determines overlap); default is the same
    as the sample size (i.e. no overlap)
:param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
    :param max_samples: maximum number of samples per author/class
:param units: the units to use, one of "words" or "verses"
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
"""

if samples_random and step is not None:
raise ValueError("random sampling is not compatible with continuous sampling (remove either the step or the samples_random argument")

if samples_random and not max_samples:
raise ValueError("random sampling needs a fixed number of samples (use the max_samples argument)")

if step is None:
step = size

Expand Down Expand Up @@ -226,15 +236,21 @@ def get_samples(path, size, step=None, units="words", format="txt", keep_punct=F

# and now generating output
samples = []
current = 0
while current + size <= len(units):
samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
current = current + step

if samples_random:
for k in range(max_samples):
samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))})

else:
current = 0
while current + size <= len(units):
samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
current = current + step

return samples


def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_punct=False,
def docs_to_samples(paths, size, step=None, units="words", samples_random=False, format="txt", keep_punct=False,
keep_sym=False, max_samples=None, identify_lang=False):
"""
Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
Expand All @@ -243,6 +259,7 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu
    :param step: size of the step when sampling successively (determines overlap); default is the same
    as the sample size (i.e. no overlap)
:param units: the units to use, one of "words" or "verses"
:param samples_random: Should random sampling with replacement be performed instead of continuous sampling (default: false)
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
:param keep_punct: whether to keep punctuation and caps.
:param max_samples: maximum number of samples per author/class.
Expand All @@ -264,7 +281,8 @@ def docs_to_samples(paths, size, step=None, units="words", format="txt", keep_pu
else:
lang = 'NA'

samples = get_samples(path, size=size, step=step, units=units, format=format,
samples = get_samples(path, size=size, step=step, samples_random=samples_random, max_samples=max_samples,
units=units, format=format,
keep_punct=keep_punct, keep_sym=keep_sym)

for sample in samples:
Expand Down
17 changes: 17 additions & 0 deletions tests/test_load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,23 @@ def test_docs_to_samples(self):
# THEN
self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 1)

# TODO: this is just minimal testing for random sampling
# WHEN
results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=False, size=2, step=None,
units="words",
format="txt", keep_punct=False, keep_sym=False,
max_samples=5, samples_random=True)
# THEN
self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 5)

    # and now test that errors are raised when parameter combinations are not consistent
# WHEN/THEN
self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, step=1, units="words",
format="txt", max_samples=5, samples_random=True)
self.assertRaises(ValueError, superstyl.preproc.pipe.docs_to_samples, self.paths, size=2, units="words",
format="txt", max_samples=None,
samples_random=True)

# TODO: test other loading formats with sampling, that are not txt (and decide on their implementation)

# Testing the processing of "myTexts" objects
Expand Down

0 comments on commit 00f1458

Please sign in to comment.