This repository has been archived by the owner on Jul 7, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #220 from vthorsteinsson/ice
Move Icelandic parsing problem to separate module
- Loading branch information
Showing
10 changed files
with
128 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# Copyright 2017 The Tensor2Tensor Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This module implements the ice_parsing_* problems, which | ||
# parse plain text into flattened parse trees and POS tags. | ||
# The training data is stored in files named `parsing_train.pairs` | ||
# and `parsing_dev.pairs`. These files are UTF-8 text files where | ||
# each line contains an input sentence and a target parse tree, | ||
# separated by a tab character. | ||
|
||
import os | ||
|
||
# Dependency imports | ||
|
||
from tensor2tensor.data_generators import generator_utils | ||
from tensor2tensor.data_generators import problem | ||
from tensor2tensor.data_generators import text_encoder | ||
from tensor2tensor.data_generators.wmt import tabbed_generator | ||
from tensor2tensor.utils import registry | ||
|
||
import tensorflow as tf | ||
|
||
|
||
# End-of-sentence marker: the EOS id exported by text_encoder. | ||
# Both generator functions in this module pass it to tabbed_generator. | ||
EOS = text_encoder.EOS_ID | ||
|
||
|
||
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
  """Build token vocabularies and encode one tab-separated pairs file.

  The pairs file is `parsing_train.pairs` when `train` is true and
  `parsing_dev.pairs` otherwise; it is read from `tmp_dir`.

  Args:
    data_dir: directory handed to get_or_generate_tabbed_vocab
      (presumably where vocab files are stored -- confirm in generator_utils).
    tmp_dir: directory containing the `.pairs` file.
    train: truthy selects the training file, falsy the dev file.
    prefix: string prepended to both vocabulary file names.
    source_vocab_size: size requested for the source (column 0) vocabulary.
    target_vocab_size: size requested for the target (column 1) vocabulary.

  Returns:
    Whatever `tabbed_generator` returns for the pairs file, the two
    vocabularies and the EOS id.
  """
  split = "train" if train else "dev"
  pairs_name = "parsing_{0}.pairs".format(split)

  # Column 0 of each line is the input sentence, column 1 the parse tree.
  source_vocab_name = prefix + "_source.tokens.vocab.%d" % source_vocab_size
  src_encoder = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, pairs_name, 0, source_vocab_name, source_vocab_size)

  target_vocab_name = prefix + "_target.tokens.vocab.%d" % target_vocab_size
  tgt_encoder = generator_utils.get_or_generate_tabbed_vocab(
      data_dir, tmp_dir, pairs_name, 1, target_vocab_name, target_vocab_size)

  return tabbed_generator(
      os.path.join(tmp_dir, pairs_name), src_encoder, tgt_encoder, EOS)
|
||
|
||
def tabbed_parsing_character_generator(tmp_dir, train):
  """Encode one tab-separated pairs file with a byte-level encoder.

  A single `ByteTextEncoder` instance is used for both the source and the
  target side, so no vocabulary file is involved.

  Args:
    tmp_dir: directory containing the `.pairs` file.
    train: truthy selects `parsing_train.pairs`, falsy `parsing_dev.pairs`.

  Returns:
    Whatever `tabbed_generator` returns for the pairs file, the shared
    encoder (passed twice) and the EOS id.
  """
  split = "train" if train else "dev"
  pairs_name = "parsing_{0}.pairs".format(split)
  byte_encoder = text_encoder.ByteTextEncoder()
  return tabbed_generator(
      os.path.join(tmp_dir, pairs_name), byte_encoder, byte_encoder, EOS)
|
||
|
||
@registry.register_problem("ice_parsing_tokens")
class IceParsingTokens(problem.Problem):
  """Problem spec for parsing tokenized Icelandic text to constituency trees.

  The target trees are also tokenized, but to a smaller vocabulary than the
  source text.
  """

  @property
  def source_vocab_size(self):
    """Size requested for the source-side token vocabulary."""
    return 2**14  # 16384

  @property
  def targeted_vocab_size(self):
    """Size requested for the target-side token vocabulary."""
    return 2**8  # 256

  @property
  def input_space_id(self):
    """Space id reported for the inputs."""
    return problem.SpaceID.ICE_TOK

  @property
  def target_space_id(self):
    """Space id reported for the targets."""
    return problem.SpaceID.ICE_PARSE_TOK

  @property
  def num_shards(self):
    """Number of shards requested for the training file paths."""
    return 10

  def feature_encoders(self, data_dir):
    """Return subword encoders for "inputs" and "targets".

    Each encoder is constructed from a vocabulary file path under `data_dir`
    whose name embeds the corresponding requested vocabulary size.
    """
    source_path = os.path.join(
        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
    target_path = os.path.join(
        data_dir, "ice_target.tokens.vocab.%d" % self.targeted_vocab_size)
    inputs_encoder = text_encoder.SubwordTextEncoder(source_path)
    targets_encoder = text_encoder.SubwordTextEncoder(target_path)
    return {"inputs": inputs_encoder, "targets": targets_encoder}

  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generate the training and dev datasets for this problem.

    Training output goes to `num_shards` file paths and dev output to a
    single file path. `task_id` is accepted but not used here.
    """
    # Arguments are evaluated in the same order as they are passed below:
    # train generator, train paths, dev generator, dev paths.
    train_examples = tabbed_parsing_token_generator(
        data_dir, tmp_dir, True, "ice",
        self.source_vocab_size, self.targeted_vocab_size)
    train_paths = self.training_filepaths(
        data_dir, self.num_shards, shuffled=False)
    dev_examples = tabbed_parsing_token_generator(
        data_dir, tmp_dir, False, "ice",
        self.source_vocab_size, self.targeted_vocab_size)
    dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
    generator_utils.generate_dataset_and_shuffle(
        train_examples, train_paths, dev_examples, dev_paths)

  def hparams(self, defaults, model_hparams):
    """Fill in modality and space-id settings on `defaults` in place.

    `model_hparams` is accepted but not used here. Nothing is returned.
    """
    inputs_vocab_size = self._encoders["inputs"].vocab_size
    defaults.input_modality = {
        "inputs": (registry.Modalities.SYMBOL, inputs_vocab_size)
    }
    # NOTE(review): the target side uses the requested size rather than the
    # encoder's actual vocab_size, unlike the inputs -- confirm intended.
    defaults.target_modality = (
        registry.Modalities.SYMBOL, self.targeted_vocab_size)
    defaults.input_space_id = self.input_space_id
    defaults.target_space_id = self.target_space_id
    # Rough estimate of avg number of tokens per word
    defaults.loss_multiplier = 2.5
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters