From 2ca3ccbb654d8ddca45e8b52c4112c924e154cfa Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Tue, 20 Nov 2018 16:57:19 +0530
Subject: [PATCH 1/9] Corrected the paths to the data, reports and models dirs

---
 demo/seq2seq_train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/demo/seq2seq_train.py b/demo/seq2seq_train.py
index d6a4ee7..0a19844 100644
--- a/demo/seq2seq_train.py
+++ b/demo/seq2seq_train.py
@@ -12,9 +12,9 @@ def main():
     np.random.seed(42)
-    data_dir_path = './data'
-    report_dir_path = './reports'
-    model_dir_path = './models'
+    data_dir_path = './demo/data'
+    report_dir_path = './demo/reports'
+    model_dir_path = './demo/models'
 
     print('loading csv file ...')
     df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
@@ -45,4 +45,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()

From 13949a89790dfc5c94eeb0af8b0583b15e71b7ec Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Tue, 20 Nov 2018 17:03:17 +0530
Subject: [PATCH 2/9] Corrected path to the models folder

In Python 2.7, the program needs to be run from the 'keras-text-summarization'
base folder with the command 'python demo/seq2seq_train.py', as this resolves
the relative path mismatch with the demo folder. Running it from the demo
folder instead raises the error
'No module named keras_text_summarization.library.utility.plot_utils'.

---
 keras_text_summarization/library/seq2seq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_text_summarization/library/seq2seq.py b/keras_text_summarization/library/seq2seq.py
index 4395a96..361c72c 100644
--- a/keras_text_summarization/library/seq2seq.py
+++ b/keras_text_summarization/library/seq2seq.py
@@ -134,7 +134,7 @@ def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_
         if epochs is None:
             epochs = DEFAULT_EPOCHS
         if model_dir_path is None:
-            model_dir_path = './models'
+            model_dir_path = './demo/models'
         if batch_size is None:
             batch_size = DEFAULT_BATCH_SIZE

From da2ceda9d74ccf3adf25caf21cf9367f59a8470c Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Tue, 20 Nov 2018 17:06:28 +0530
Subject: [PATCH 3/9] Support for urllib library for Python 2.7

The urllib library has slightly different syntax and functionality in
Python 2 and Python 3. Added the changes needed to support Python 2.7
while training.
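For reference, a minimal sketch of an equivalent single-import shim
(illustrative only; the download_glove_zip helper and GLOVE_URL constant
below are not part of this patch):

    # Illustrative Python 2/3 compatible download helper (not part of this patch).
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    GLOVE_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'  # same URL as in the patch

    def download_glove_zip(glove_zip, reporthook=None):
        # urlretrieve(url, filename, reporthook) has the same call shape
        # in both Python 2's urllib and Python 3's urllib.request.
        urlretrieve(GLOVE_URL, glove_zip, reporthook)

With a shim like this, the runtime check on sys.version[0] inside
download_glove() would not be needed; the same urlretrieve call works
under both interpreters.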
---
 .../library/utility/glove_loader.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/keras_text_summarization/library/utility/glove_loader.py b/keras_text_summarization/library/utility/glove_loader.py
index 9bd14dd..df81e39 100644
--- a/keras_text_summarization/library/utility/glove_loader.py
+++ b/keras_text_summarization/library/utility/glove_loader.py
@@ -1,4 +1,7 @@
-import urllib.request
+try:
+    import urllib.request
+except ImportError:
+    import urllib
 import os
 import sys
 import zipfile
@@ -33,8 +36,12 @@ def download_glove(data_dir_path=None):
 
     if not os.path.exists(glove_zip):
         print('glove file does not exist, downloading from internet')
-        urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip,
+        if sys.version[0]=="3":
+            urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip,
                                    reporthook=reporthook)
+        elif sys.version[0]=="2":
+            urllib.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip, reporthook=reporthook)
+
         print('unzipping glove file')
 
     zip_ref = zipfile.ZipFile(glove_zip, 'r')

From 04c7bb201938bcda16348021b08bb4c97064c5f6 Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Tue, 20 Nov 2018 17:08:10 +0530
Subject: [PATCH 4/9] More general way of running the train program from the
 base folder

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 42b5fa3..fbcff20 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,7 @@ To train a deep learning model, say Seq2SeqSummarizer, run the following command
 
 ```bash
 pip install requirements.txt
-cd demo
-python seq2seq_train.py
+python demo/seq2seq_train.py
 ```
 
 The training code in seq2seq_train.py is quite straightforward and illustrated below:

From dc393f9a7a32da5493833b1b8173d4a6c191dbd2 Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Mon, 24 Dec 2018 06:43:45 +0530
Subject: [PATCH 5/9] Add files via upload

---
 demo/seq2seq_predict.py | 68 ++++++++++++++---------
 demo/seq2seq_train.py   | 96 ++++++++++++++++++++---------------------
 2 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/demo/seq2seq_predict.py b/demo/seq2seq_predict.py
index 8ac195a..4331b1a 100644
--- a/demo/seq2seq_predict.py
+++ b/demo/seq2seq_predict.py
@@ -1,34 +1,34 @@
-from __future__ import print_function
-
-import pandas as pd
-from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
-import numpy as np
-
-
-def main():
-    np.random.seed(42)
-    data_dir_path = './data'
-    model_dir_path = './models'
-
-    print('loading csv file ...')
-    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")
-    X = df['text']
-    Y = df.title
-
-    config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item()
-
-    summarizer = Seq2SeqSummarizer(config)
-    summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))
-
-    print('start predicting ...')
-    for i in np.random.permutation(np.arange(len(X)))[0:20]:
-        x = X[i]
-        actual_headline = Y[i]
-        headline = summarizer.summarize(x)
-        # print('Article: ', x)
-        print('Generated Headline: ', headline)
-        print('Original Headline: ', actual_headline)
-
-
-if __name__ == '__main__':
-    main()
+from __future__ import print_function
+
+import pandas as pd
+from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer
+import numpy as np
+
+
+def 
main(): + np.random.seed(42) + data_dir_path = './data' + model_dir_path = './models' + + print('loading csv file ...') + df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv") + X = df['text'] + Y = df.title + + config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item() + + summarizer = Seq2SeqSummarizer(config) + summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path)) + + print('start predicting ...') + for i in np.random.permutation(np.arange(len(X)))[0:20]: + x = X[i] + actual_headline = Y[i] + headline = summarizer.summarize(x) + # print('Article: ', x) + print('Generated Headline: ', headline) + print('Original Headline: ', actual_headline) + + +if __name__ == '__main__': + main() diff --git a/demo/seq2seq_train.py b/demo/seq2seq_train.py index 0a19844..d4c4453 100644 --- a/demo/seq2seq_train.py +++ b/demo/seq2seq_train.py @@ -1,48 +1,48 @@ -from __future__ import print_function - -import pandas as pd -from sklearn.model_selection import train_test_split -from keras_text_summarization.library.utility.plot_utils import plot_and_save_history -from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer -from keras_text_summarization.library.applications.fake_news_loader import fit_text -import numpy as np - -LOAD_EXISTING_WEIGHTS = False - - -def main(): - np.random.seed(42) - data_dir_path = './demo/data' - report_dir_path = './demo/reports' - model_dir_path = './demo/models' - - print('loading csv file ...') - df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv") - - print('extract configuration from input texts ...') - Y = df.title - X = df['text'] - - config = fit_text(X, Y) - - summarizer = Seq2SeqSummarizer(config) - - if LOAD_EXISTING_WEIGHTS: - summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path)) - - Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42) - - print('demo size: ', len(Xtrain)) - print('testing size: ', len(Xtest)) - - print('start fitting ...') - history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100) - - history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png' - if LOAD_EXISTING_WEIGHTS: - history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png' - plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'}) - - -if __name__ == '__main__': - main() +from __future__ import print_function + +import pandas as pd +from sklearn.model_selection import train_test_split +from keras_text_summarization.library.utility.plot_utils import plot_and_save_history +from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer +from keras_text_summarization.library.applications.fake_news_loader import fit_text +import numpy as np + +LOAD_EXISTING_WEIGHTS = False + + +def main(): + np.random.seed(42) + data_dir_path = './data' + report_dir_path = './reports' + model_dir_path = './models' + + print('loading csv file ...') + df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv") + + print('extract configuration from input texts ...') + Y = df.title + X = df['text'] + + config = fit_text(X, Y) + + summarizer = Seq2SeqSummarizer(config) + + if LOAD_EXISTING_WEIGHTS: + summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path)) + + Xtrain, Xtest, Ytrain, Ytest = 
train_test_split(X, Y, test_size=0.2, random_state=42) + + print('demo size: ', len(Xtrain)) + print('testing size: ', len(Xtest)) + + print('start fitting ...') + history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100) + + history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png' + if LOAD_EXISTING_WEIGHTS: + history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png' + plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'}) + + +if __name__ == '__main__': + main() \ No newline at end of file From e4ad1e82c2c021e15afd45c31534b607ee457328 Mon Sep 17 00:00:00 2001 From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com> Date: Mon, 24 Dec 2018 06:48:21 +0530 Subject: [PATCH 6/9] Add files via upload --- demo/seq2seq_predict.py | 8 ++++---- demo/seq2seq_train.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/demo/seq2seq_predict.py b/demo/seq2seq_predict.py index 4331b1a..b5121ad 100644 --- a/demo/seq2seq_predict.py +++ b/demo/seq2seq_predict.py @@ -7,13 +7,13 @@ def main(): np.random.seed(42) - data_dir_path = './data' - model_dir_path = './models' + data_dir_path = './demo/data' + model_dir_path = './demo/models' print('loading csv file ...') - df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv") + df = pd.read_csv(data_dir_path + "/wiki_v_small.csv") # "/fake_or_real_news.csv") X = df['text'] - Y = df.title + Y = df['summary'] config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)).item() diff --git a/demo/seq2seq_train.py b/demo/seq2seq_train.py index d4c4453..62ce7e8 100644 --- a/demo/seq2seq_train.py +++ b/demo/seq2seq_train.py @@ -12,15 +12,15 @@ def main(): np.random.seed(42) - data_dir_path = './data' - report_dir_path = './reports' - model_dir_path = './models' + data_dir_path = './demo/data' + report_dir_path = './demo/reports' + model_dir_path = './demo/models' print('loading csv file ...') - df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv") + df = pd.read_csv(data_dir_path + "/wikihowAll.csv") print('extract configuration from input texts ...') - Y = df.title + Y = df['summary'] X = df['text'] config = fit_text(X, Y) @@ -45,4 +45,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() From 2b6a420af02dc0b2495b74fb5f8d76210589924d Mon Sep 17 00:00:00 2001 From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com> Date: Mon, 24 Dec 2018 06:49:29 +0530 Subject: [PATCH 7/9] Add files via upload --- demo/data/test1 | 10 ++ demo/data/wiki_v_small.csv | 201 +++++++++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 demo/data/test1 create mode 100644 demo/data/wiki_v_small.csv diff --git a/demo/data/test1 b/demo/data/test1 new file mode 100644 index 0000000..73cfb59 --- /dev/null +++ b/demo/data/test1 @@ -0,0 +1,10 @@ +An Indian-American woman has been elected as the president of the powerful student's body of the prestigious Harvard University. +Sruthi Palaniappan, 20, whose parents migrated to the US from Chennai in 1992, was elected as president of the Harvard University Undergraduate Council. +Her running mate Julia Huesa, 20, was elected as vice president, according to an announcement by the Undergraduate Council Election Commission. 
+Ms Palaniappan said that Ms Huesa and her planned to work on improving the Council's communication with the student body in their initial days in office. +"I think from the onset, better structuring the way we communicate with students is something that we need to already set the tone and plan for," she said. +"I think we're going to work on it before we even leave for break and just get off the ground running," she told Harvard Crimson, the student newspaper of the varsity. +Ms Palaniappan was the youngest delegate at the Democratic National Convention in Philadelphia in July 2016. +According to the report, Ms Palaniappan and Ms Huesa garnered nearly 41.5 per cent of the votes as against their nearest opponent Nadine M Khoury and Arnav Agrawal, who received 26.6 per cent of the votes. +They ran their campaign under the slogan "Make Harvard Home". +The duo, a long-time member of the Undergraduate Council, are scheduled to take over from the outgoing president Catherine L Zhang '19 and vice president Nicholas D Boucher '19. \ No newline at end of file diff --git a/demo/data/wiki_v_small.csv b/demo/data/wiki_v_small.csv new file mode 100644 index 0000000..a58277d --- /dev/null +++ b/demo/data/wiki_v_small.csv @@ -0,0 +1,201 @@ +headline,title,text +" +Keep related supplies in the same area., +Make an effort to clean a dedicated workspace after every session., +Place loose supplies in large, clearly visible containers., +Use clotheslines and clips to hang sketches, photos, and reference material., +Use every inch of the room for storage, especially vertical space., +Use chalkboard paint to make space for drafting ideas right on the walls., +Purchase a label maker to make your organization strategy semi-permanent., +Make a habit of throwing out old, excess, or useless stuff each month.",How to Be an Organized Artist1," If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept with brushes, cleaner, and canvas, print supplies should be by the ink, etc. Make broader groups and areas for your supplies to make finding them easier, limiting your search to a much smaller area. Some ideas include: + + +Essential supplies area -- the things you use every day. +Inspiration and reference area. +Dedicated work area . +Infrequent or secondary supplies area, tucked out of the way.; +, This doesn't mean cleaning the entire studio, it just means keeping the area immediately around the desk, easel, pottery wheel, etc. clean each night. Discard trash or unnecessary materials and wipe down dirty surfaces. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying. + + +Even if the rest of your studio is a bit disorganized, an organized workspace will help you get down to business every time you want to make art. + +, As visual people, a lot of artist clutter comes from a desire to keep track of supplies visually instead of tucked out of sight. By using jars, old glasses, vases, and cheap, clear plastic drawers, you can keep things in sight without leaving it strewn about haphazardly. Some ideas, beyond those just mentioned, include: + + +Canvas shoe racks on the back of the door +Wine racks with cups in each slot to hold pens/pencils. 
+Plastic restaurant squirt bottles for paint, pigment, etc., Simply string up the wires across a wall or along the ceiling and use them to hold essential papers that you don't want to cut or ruin with tacks or tape. Cheap and easy, this is also a good way to handle papers and ideas you touch regularly or need to pin up and down for inspiration., Shelving is an artist's best friend and is a cheap and easy way to get more room in your studio or art space. Don't be afraid to get up high either, especially for infrequently used supplies. The upper reaches of the room are often the most under-utilized, but provide vital space for all your tools and materials., Turning one wall into a chalkboard gives you a perfect space for ideas, sketches, and planning without requiring extra equipment or space. You can even use it for smaller areas. Paint over jars or storage equipment, allowing you to relabel them with chalk as your needs change. + +, A lot of disorganization comes when you keep moving the location of things, trying to optimize your space by reorganizing frequently. This usually has the opposite effect, leading to lost items and uncertainty when cleaning, but an afternoon with a label maker can solve everything. Instead of spending all of your mental energy looking for or storing things, you can just follow the labels, freeing your mind to think about art., Once a month, do a purge of your studio. If it isn't essential or part of a project, either throw it out or file it away for later. Artists are constantly making new things, experimenting, and making a mess. This is a good thing, but only if you set aside time to declutter. It may not be fun at the moment, but it is a lot more fun than spending 30 minutes digging through junk to find the right paint or an old sketch. + + +Don't be sentimental here. If you haven't used it in the last six months there is little chance you'll use it in the next six months. Toss it. + +" +" +Create a sketch in the NeoPopRealist manner of the future mural on a small piece of paper 8""x10"" using the black ink pen., +Prepare to create your NeoPopRealist mural., +Prepare your paint., +Begin your project with a design., +Produce a scaled down version of your finished mural., +Prepare the wall to be painted., +After you have primed the surface, measure the wall., +Paint in the base coat of the background., +Allow the background and base coats to dry., +Draw the lines, then fill the appeared section with different repetitive patterns (examine the images above)., +Paint patterns with brushes of suitable size for the particular portion of work you are painting., +Clean up the lines and shapes as needed., +Seal the mural if needed., +Be inspired and it will help you succeed!",How to Create a Neopoprealist Art Work," See the image for how this drawing develops step-by-step. However, there is an important detail: the following drawings are to examine it, and then, to create something unique. + + +Use the lines to create the image shape and sections. +Fill appeared sections with different patterns/ ornaments. +Add text if needed, for example ""NeoPopRealism is 25!"" +Add a colored strip on the top, any color you wish.; +, Painting a mural always requires some preparation. You‘ll need equipment and effort, but planning and attention to detail will help you succeed. Painting a mural requires a suitable location, with the right surface that can be painted. + +This surface should be smooth and flat. 
However, even rough-textured surfaces can be used for your NeoPopRealist mural project. + +, For exterior projects that last for years, using a newer 100% acrylic exterior paint would be your best choice. For interior walls, use latex paints. Latex offer easier cleanup and lower costs. By measuring the total wall area to be covered, the total amount of paint can be calculated, but since this mural painting requires two colors - white and black - figuring the actual area to be painted each color is necessary to allow purchasing the right amount of each one. + +Large walls backgrounds may be rolled or sprayed with a white paint sprayer, where details may be added with brushes. +Paints are sensitive to high temperatures, humidity, direct sunlight, however, the interior projects do not have many complications. In public places, keeping the mural protected may require attention. For that reason, if you make your NeoPopRealist mural dedicated to its 25-year anniversary in school or office, you can consider using varnish for your mural. + +, See sample above. The design will give you a sense of proportion. You will have unique requirements and elements., Use a sketch and measure at scale the distances and locations of various points of your subject. Measuring key features will help you calculate the amount of paint when each feature is identified by its color., If the surface is low, the whole mural can be painted standing on the ground or from a stepladder. For higher work, you may have to rent a scaffold., Mark the horizontal and vertical lines., Use white paint as background., Then begin marking, using your scaled sketch, the location of key elements of objects, located in the foreground. Everything depends on the complexity of your mural. If you are confident in your artistic results, you may choose to draw all of the details in freehand., Be careful, keep clean transition edges from one color (black) to another (white). However, mistakes can be touched up later. Always allow fresh color to dry before proceeding to the drawing., An example would be painting a large patterns use big brushes, limbs, use small brushes for tiny and detailed patterns., If you have a drip or run, paint over it with the color paint appropriate for that location. Sharpen lines and patterns if they are blurred., If it is intended to last a long time or if it is on a surface that require cleaning, overcoat your mural project with a clear sealer., However, if you won't be able to involve more than 16 percent of your brain's grays matter, you'll end up with the primitive crafting of patterns or even worse, with doodling or so-called zen-doodling. But to create the NeoPopRealist ART one needs the abilities, which can be developed in talented people by studying using Nadia Russ' Neopoprealist instructional books. Other, the copycats' self-promotional superficial books will teach you only how to doodle because they have nothing in common with visual arts and its mission. + +" +" +Get a bachelor’s degree., +Enroll in a studio-based program., +Train on a number of VFX computer programs., +Watch online tutorials., +Nurture your artistic side., +Pay close attention to movies, television shows, and video games., +Develop a specialization.",How to Be a Visual Effects Artist1," It is possible to become a VFX artist without a college degree, but the path is often easier with one. VFX artists usually major in fine arts, computer graphics, or animation. 
Choose a college with a reputation for strength in these areas and a reputation for good job placement for graduates. The availability of internships is another factor to consider.Out of the jobs advertised for VFX artists, a majority at any given time specify a bachelor’s degree as a minimum requirement for applicants.; +, Some studios offer short-term programs for people who want to learn more about VFX artistry without pursuing a college degree. Enrolling in these programs can be expensive as financial aid isn’t always offered, but they usually have the most cutting edge technology for you to learn from., Although you may create some hand sketches, the majority of your work will be completed on the computer using the most up-to-date programs. Stay informed about the newest software advances by following VFX blogs and taking online computer tutorials.For example, VFX artists are expected to be well-versed in graphics and animation programs, such as Adobe Creative Suite and JavaScript.Clearly list every program that you can work with on your resume. + +, Hop onto YouTube or another video service and search for VFX clip reels or demonstrations. Some of these videos will focus on a particular skill set, such as shading, which you then can practice on your own. Challenge yourself to mimic some of the more difficult tasks, or even try to improve upon the models used., Take as many art and design classes as you can. Or, simply carry a sketch pad around with you to work on your basic animation skills. As you draw, consider factors such as lighting and framing. Even geometry skills can come in handy when creating a particular type of background or even a person’s face.Make a choice to become an observer of the world around you. Ask yourself: how could I capture the movement of the leaves? Or, in what situations do shadows appear? + +, Watch all of these creations with an eye for detail. Look for the techniques used and any original approaches that you see. Try to recreate any scenes that you find particularly interesting. Research the artists and see what their backgrounds are and contact them if you like., As you gain more experience, you’ll likely find yourself gravitating toward a certain aspect of design. This will become your “calling card” and directors and other professionals will seek you out for this type of work. To build your specialization, start choosing jobs with that emphasis and attend additional training seminars.For example, some VFX specialists focus on human character’s faces, animal figures, or city backgrounds. + +" +" +Start with some experience or interest in art., +Understand the difference between art collectors, art investors and art speculators., +Figure out what you are willing to pay for art, before going to an auction house., +Pay attention to what schools of art are selling well, and which are down., +Focus art investments on fine art paintings, rather than decorative art., +Reach out to trusted auction houses and dealers when you are looking to buy art., +Buy your investment art when you feel confident of its worth, its price and its ability to grow in value., +Study how art is properly stored., +Have your art investments appraised occasionally., +Consider renting out your art investments., +Understand that selling an art investment can take time.",How to Become an Art Investor," The best art investors do their research on the pieces of art that they buy, so someone with some education or interest in the art world is more likely to understand this niche market. 
As well as personal research, you will need to have contacts with people in the art world, such as auctioneers, gallery directors and dealers, who can give you good investment advice.; +, You may confuse these three terms, if you are not careful. Each of them has a slightly different goal in mind when looking to buy art. + + +Art collectors do not buy art for investment purposes. They buy it to decorate and display in their home. Because they consider them to be an important part of their home or life, most art collectors have a hard time parting with pieces of their collection. While many collectors do end up selling some pieces of art, it may be done because of necessity. Collectors often loan their works out to museums and occasionally donate them to museums upon their death. +Art investors seek to diversify their portfolio with an art investment. Some investment firms put about two and a half to three percent of their investment money in art. They seek good advice and often buy paintings that are older and have been popular historically, such as paintings by the Old Masters. These investments are kept over decades, and sold off when the market is right, with the investor seeking to get a six to ten percent profit rise per year. These investments are also often made to be given as inheritance to future generations of the family. Art investment is often undertaken by the very wealthy. +Art speculators try to invest in art that they believe will appreciate in value. They aim to buy art at a low price from budding artists in the beginning of their careers. Then they hope to sell their work in 10 to 15 years when the artists are at the peak of their careers and people or collectors are willing to pay much more for their pieces. This is a type of investment that takes intimate involvement with the art world and liquidity in order to buy the art. + +, Art investments should be no more than a small part of your investment portfolio, along with stocks, bonds, new businesses and more. Figure out what your range is before you begin to pick out potential pieces, and get advice from investors and art dealers. + +, Study the Mei Moses Fine Art Index to get a firm grasp of the art market today. Although they cannot predict what will be popular in the future, they can tell you what art tends to keep its value and be a low-risk and what art has a more volatile market value. + +, While this is not an absolute rule to follow, paintings from successful artists tend to get better returns than sculpture and installation art. + +, Get all the information you can out of them before making a purchase. If you are going to buy at auction, be prepared to walk away if the price goes higher than your investment range. + + +Beware of art auctioneers or dealers that promise too high a return on paintings. In many ways, they are just like stock brokers and other financial investment firms, who may promise anything to get a sale. They should be found trustworthy before you buy anything, because ponzi schemes and art market bubbles are part of this investment landscape as well. +If you have never bought art in an auction environment before, you may want to seek advice about how it is done properly. You should study the auction booklet before hand, learn about secret buyers and how prices can quickly inflate. Most fine auction houses would be willing to teach you the basics if they see you as a serious investor. + +, Arrange for payment, shipping and insurance. 
Each piece of art should be insured and catalogued as part of your estate. + +, In order for an art investment to retain its value, it should be kept at low-humidity and avoid being marred. You may choose to hang it in your home, but you may want to get an art collector's advice about where to hang it and how to care for it. + + +Share this information with your children, if it is intended as an inheritance investment. They need to be well aware of how to take care of art, or they may lose money or ruin the painting entirely. + +, As well as keeping tabs on the art world to understand the rise and falls of certain schools of art, an appraiser can tell you how your investment is maturing. They may clue you in on when you have reached your desired profit. + +, If the art you buy does not fit in your home, and you will only be storing it, research banks, hotels and other institutions that rent fine art on a rotating basis. You may be able to demand thousands of dollars per year for your art to hang in another building. Keep in mind you will need to make sure your art is covered by an insurance policy for loss or damage. + + +Make sure the renter provides insurance for your art. You should prepare a contract that stipulates the time allotted, the fee, the insurance and the shipment of your art. + +, Unless you employ an art dealer or auction house at the exact moment when the art is in high value, it can take years and thousands of dollars in fees to find the right buyer. + +" +" +Keep your reference materials, sketches, articles, photos, etc, in one easy to find place., +Make ""studies,"" or practice sketches, to organize effectively for larger projects., +Limit the supplies you leave out to the project at hand., +Keep an updated list of all of the necessary supplies, and the quantities of each., +Break down bigger works into more easily completed parts.",How to Be an Organized Artist2," As you start planning for a project or work, you'll likely be gathering scraps of inspiration and test sketches. While everyone has a strategy, there is nothing more maddening than digging through a book or the internet to re-find the cool idea you saw three months ago. Try out: + + +Dedicating 1 notebook, preferably with insert folders, to each project. +Making a bookmark folder for each project on your internet browser to easily compile online inspiration. +Tacking up physical inspiration on a wall or cork board near your workspace., Very few artists simply dive right into large projects. Almost 100% of the time they instead work on related, smaller projects called ""studies"" to prepare for the larger work. You might practice the face of the portrait you're making, sketch our different composition ideas, or practice a vulnerable or difficult part of a sculpture. Keep these organized as a way to prepare both the skills, ideas, and supplies needed for the final project. + +, At the end of the day, artists are visual people, and tucking everything away neatly and cleanly may not be conducive to the artistic process. Of course, neither is losing or misplacing essential supplies. Find a compromise by packing away any supplies not currently in use, and leaving a little bit of ""essential"" clutter. It's okay to have inspiration scattered around the studio -- just make sure it's the inspiration you need for the current project. + + +Just because you ""aren't organized"" is no excuse not to make an attempt. Don't feel like the only options are perfect cleanliness or an utter mess-- there is a middle ground. 
+ +, Nothing is worse than spending a long night on a painting only to realize you've run out of white paint halfway through a section. Once a week, or more frequently if possible, check in on the quantities of your supplies so that you can refill them before it becomes a problem. + + +A simple spreadsheet or notebook, marked at the end of each artistic session, is a quick and easy way to keep tabs on your stuff. + +, Deciding to paint a mural is a huge undertaking. But sketching the idea, transposing the image onto the wall, painting the basic colors, then adding shading/detail are four separate and more manageable projects. Organization is key to big projects, even if it feels ""constraining"" to your creativity. In reality, organizing your work and progress frees your mind to actually be creative, instead of worrying about logistics. + + +Figure out the building blocks of each part of the project, tackling each at once. Don't jump around across all parts of the project haphazardly. + +" +" +Keep all of your past work organized and accessible., +Record all of your artistic contacts and connections in one place., +Log the costs of supplies needed for each project., +Find out how much it costs you to make and sell each piece., +Pay attention to what other, similar works of art are selling for., +Consider the ""cost"" of your time when pricing work.",How to Be an Organized Artist3," When you finish a project, whether it sells or not, don't just stuff it away in a drawer. You never know when you'll want to revisit and idea or, more excitingly, when interest in your current work will drive up interest in past projects. + + +If you do electronic work, back it up every 3-6 months on a dedicated hard drive. There is nothing worse than an accident destroying all of your old projects., More than many industries, successful artists need to cultivate a diverse network of other artists, curators, instructors, and gallery assistants to be successful. You never know when someone will hit it big and provide a helping hand, or when you'll have some work you want to place in a friend's art show. Don't leave meetings and connections up to chance -- organize and compile your contact information in once place for later. Make notes of: + + +Phone number +Email +Location +Role in the art world +How you met or connected., If you're looking to make a living off art, you need to treat certain aspects of the process like a business. This, however, does not need to interfere with your creative process. Simply holding on to your receipts and writing them down in one sheet is a great first step towards financial security and autonomy. + + +You can often write off almost all of these receipts on your taxes, as they are private business expenses. Keeping expenses organized isn't just about time, it is about saving money., If you're making the same or similar pieces each day, you can figure out how much each piece costs you to make them by dividing the cost in supplies by the number of pieces made. So, if 10 wood sculptures cost you $100, each sculpture cost you $10 to make (100/10 = 10). This may seem trivial, but you need to have a complete picture of your finances if you hope to make money off your work. + + +At the very least, ensure that you are not losing money on each piece. + +, If you want an organized, productive artistic practice, you need to know about the trends around your work. Keeping organized is about more than just your own studio, it is about understanding the art market you're a part of. 
Peruse Etsy, visit galleries and show openings, and follow art blogs and news to keep abreast of the latest developments and prices. + +, While it doesn't have a dollar value, make sure you value your time as well as your materials. In the earlier example, don't forget that it cost more than just $10 to make the sculpture. Your hours of work and experience are essential too, so don't sell the piece for $20 if it took you a week to make. While pricing your work is difficult, don't balk at the high prices you see other artists selling for -- not everyone can do what you do, and people are paying for your talent and experience. + + +At the very least, consider what you'd be paid if you used your time otherwise. Twenty hours spent painting could be worth $15 an hour at another job. You should consider this ""missed"" money when pricing work. +If you want to make art your living, you need to price the work high enough to take care of yourself. Careful financial organization is essential to pulling this off. + +" +" +Create a compelling reel or portfolio., +Land an internship., +Consider self-employment., +Sign on with a design company or studio., +Move up to a supervisor position.",How to Be a Visual Effects Artist2," This should be a short video showcasing the breadth and depth of your skills as an artist. Some choose to follow a storyline format while others cycle through a series of clips. Most college programs will give you time to create this work in your junior or senior years using professional grade equipment and software.Your reel is also a chance for you to showcase any unique skills that you possess, such as drawing or sculpting.Don’t be afraid to work with other artists to create your reel. Showing that you can collaborate well with others is something that studios often look for when hiring. + +, If you are enrolled in a VFX program, talk to your career counselors to see what opportunities might be available. If you are developing your skills on your own, reach out to studios to see if they have any spots for paid or unpaid interns. This will give a potential employer a chance to get to know you and might make it easier for them to hire you in the future., Over half of VFX artists are their own bosses. Being a freelance designer gives you more control over your schedule and project selection. But, it also means that you will need to handle administrative tasks and could struggle with bringing in a consistent income.If this path interests you, it would be wise to take a few classes in marketing and accounting. Getting established is a hard struggle that many self-employed artists face.Choosing your own projects can mean that you’ll establish a specialty more quickly than you might in a large studio. However, affording the most recent design equipment may be tough, depending on how successful you are. + +, This is a more traditional option where you agree to full or part-time employment with an established company. Your work schedule and salary will very much depend on the prominence of the company and the types of projects that come in. However, you may get the opportunity to work on some big Hollywood blockbusters or top-rated television shows!Get your foot in the door of larger companies by serving as a junior 2D artist or runner. A junior artist will assist those with seniority by creating basic outlines for scenery and the like. 
A runner literally runs sketches and communications between the VFX team and a director, for example., With enough time, effort, and luck, you may get the chance to take on a leadership position for a project. As a supervisor, the final product that audiences see is your responsibility. You’ll take all of the raw images and make them fit together while working alongside the production team." From 7247f9ed09b621c3ccfa4abf8655d57b54959938 Mon Sep 17 00:00:00 2001 From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com> Date: Mon, 24 Dec 2018 06:54:33 +0530 Subject: [PATCH 8/9] Add files via upload --- .../library/applications/fake_news_loader.py | 134 +- keras_text_summarization/library/rnn.py | 1610 ++++++++--------- keras_text_summarization/library/seq2seq.py | 1182 ++++++------ .../library/utility/device_utils.py | 44 +- .../library/utility/glove_loader.py | 157 +- .../library/utility/plot_utils.py | 212 +-- 6 files changed, 1669 insertions(+), 1670 deletions(-) diff --git a/keras_text_summarization/library/applications/fake_news_loader.py b/keras_text_summarization/library/applications/fake_news_loader.py index 5439134..cffe22b 100644 --- a/keras_text_summarization/library/applications/fake_news_loader.py +++ b/keras_text_summarization/library/applications/fake_news_loader.py @@ -1,67 +1,67 @@ -from collections import Counter - -MAX_INPUT_SEQ_LENGTH = 500 -MAX_TARGET_SEQ_LENGTH = 50 -MAX_INPUT_VOCAB_SIZE = 5000 -MAX_TARGET_VOCAB_SIZE = 2000 - - -def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None): - if input_seq_max_length is None: - input_seq_max_length = MAX_INPUT_SEQ_LENGTH - if target_seq_max_length is None: - target_seq_max_length = MAX_TARGET_SEQ_LENGTH - input_counter = Counter() - target_counter = Counter() - max_input_seq_length = 0 - max_target_seq_length = 0 - - for line in X: - text = [word.lower() for word in line.split(' ')] - seq_length = len(text) - if seq_length > input_seq_max_length: - text = text[0:input_seq_max_length] - seq_length = len(text) - for word in text: - input_counter[word] += 1 - max_input_seq_length = max(max_input_seq_length, seq_length) - - for line in Y: - line2 = 'START ' + line.lower() + ' END' - text = [word for word in line2.split(' ')] - seq_length = len(text) - if seq_length > target_seq_max_length: - text = text[0:target_seq_max_length] - seq_length = len(text) - for word in text: - target_counter[word] += 1 - max_target_seq_length = max(max_target_seq_length, seq_length) - - input_word2idx = dict() - for idx, word in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE)): - input_word2idx[word[0]] = idx + 2 - input_word2idx['PAD'] = 0 - input_word2idx['UNK'] = 1 - input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()]) - - target_word2idx = dict() - for idx, word in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE)): - target_word2idx[word[0]] = idx + 1 - target_word2idx['UNK'] = 0 - - target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) - - num_input_tokens = len(input_word2idx) - num_target_tokens = len(target_word2idx) - - config = dict() - config['input_word2idx'] = input_word2idx - config['input_idx2word'] = input_idx2word - config['target_word2idx'] = target_word2idx - config['target_idx2word'] = target_idx2word - config['num_input_tokens'] = num_input_tokens - config['num_target_tokens'] = num_target_tokens - config['max_input_seq_length'] = max_input_seq_length - config['max_target_seq_length'] = max_target_seq_length - - return config 
+from collections import Counter + +MAX_INPUT_SEQ_LENGTH = 500 +MAX_TARGET_SEQ_LENGTH = 50 +MAX_INPUT_VOCAB_SIZE = 5000 +MAX_TARGET_VOCAB_SIZE = 2000 + + +def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None): + if input_seq_max_length is None: + input_seq_max_length = MAX_INPUT_SEQ_LENGTH + if target_seq_max_length is None: + target_seq_max_length = MAX_TARGET_SEQ_LENGTH + input_counter = Counter() + target_counter = Counter() + max_input_seq_length = 0 + max_target_seq_length = 0 + + for line in X: + text = [word.lower() for word in line.split(' ')] + seq_length = len(text) + if seq_length > input_seq_max_length: + text = text[0:input_seq_max_length] + seq_length = len(text) + for word in text: + input_counter[word] += 1 + max_input_seq_length = max(max_input_seq_length, seq_length) + + for line in Y: + line2 = 'START ' + line.lower() + ' END' + text = [word for word in line2.split(' ')] + seq_length = len(text) + if seq_length > target_seq_max_length: + text = text[0:target_seq_max_length] + seq_length = len(text) + for word in text: + target_counter[word] += 1 + max_target_seq_length = max(max_target_seq_length, seq_length) + + input_word2idx = dict() + for idx, word in enumerate(input_counter.most_common(MAX_INPUT_VOCAB_SIZE)): + input_word2idx[word[0]] = idx + 2 + input_word2idx['PAD'] = 0 + input_word2idx['UNK'] = 1 + input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()]) + + target_word2idx = dict() + for idx, word in enumerate(target_counter.most_common(MAX_TARGET_VOCAB_SIZE)): + target_word2idx[word[0]] = idx + 1 + target_word2idx['UNK'] = 0 + + target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) + + num_input_tokens = len(input_word2idx) + num_target_tokens = len(target_word2idx) + + config = dict() + config['input_word2idx'] = input_word2idx + config['input_idx2word'] = input_idx2word + config['target_word2idx'] = target_word2idx + config['target_idx2word'] = target_idx2word + config['num_input_tokens'] = num_input_tokens + config['num_target_tokens'] = num_target_tokens + config['max_input_seq_length'] = max_input_seq_length + config['max_target_seq_length'] = max_target_seq_length + + return config diff --git a/keras_text_summarization/library/rnn.py b/keras_text_summarization/library/rnn.py index 54e67c1..f6f365b 100644 --- a/keras_text_summarization/library/rnn.py +++ b/keras_text_summarization/library/rnn.py @@ -1,805 +1,805 @@ -from __future__ import print_function - -from keras.models import Model, Sequential -from keras.layers import Embedding, Dense, Input, RepeatVector, TimeDistributed, concatenate, Merge, add, Dropout -from keras.layers.recurrent import LSTM -from keras.preprocessing.sequence import pad_sequences -from keras.callbacks import ModelCheckpoint -import numpy as np -import os - -HIDDEN_UNITS = 100 -DEFAULT_BATCH_SIZE = 64 -VERBOSE = 1 -DEFAULT_EPOCHS = 10 - - -class OneShotRNN(object): - model_name = 'one-shot-rnn' - """ - The first alternative model is to generate the entire output sequence in a one-shot manner. - That is, the decoder uses the context vector alone to generate the output sequence. - - This model puts a heavy burden on the decoder. - It is likely that the decoder will not have sufficient context for generating a coherent output sequence as it - must choose the words and their order. 
- """ - - def __init__(self, config): - self.num_input_tokens = config['num_input_tokens'] - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.input_word2idx = config['input_word2idx'] - self.input_idx2word = config['input_idx2word'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.config = config - self.version = 0 - if 'version' in config: - self.version = config['version'] - - print('max_input_seq_length', self.max_input_seq_length) - print('max_target_seq_length', self.max_target_seq_length) - print('num_input_tokens', self.num_input_tokens) - print('num_target_tokens', self.num_target_tokens) - - # encoder input model - model = Sequential() - model.add(Embedding(output_dim=128, input_dim=self.num_input_tokens, input_length=self.max_input_seq_length)) - - # encoder model - model.add(LSTM(128)) - model.add(RepeatVector(self.max_target_seq_length)) - # decoder model - model.add(LSTM(128, return_sequences=True)) - model.add(TimeDistributed(Dense(self.num_target_tokens, activation='softmax'))) - - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) - - self.model = model - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - self.model.load_weights(weight_file_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = [] - for word in line.lower().split(' '): - wid = 1 - if word in self.input_word2idx: - wid = self.input_word2idx[word] - x.append(wid) - if len(x) >= self.max_input_seq_length: - break - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def transform_target_encoding(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x) >= self.max_target_seq_length: - break - temp.append(x) - - temp = np.array(temp) - print(temp.shape) - return temp - - def generate_batch(self, x_samples, y_samples, batch_size): - num_batches = len(x_samples) // batch_size - while True: - for batchIdx in range(0, num_batches): - start = batchIdx * batch_size - end = (batchIdx + 1) * batch_size - encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) - decoder_target_data_batch = np.zeros( - shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - for lineIdx, target_words in enumerate(y_samples[start:end]): - for idx, w in enumerate(target_words): - w2idx = 0 # default [UNK] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - if w2idx != 0: - decoder_target_data_batch[lineIdx, idx, w2idx] = 1 - yield encoder_input_data_batch, decoder_target_data_batch - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + OneShotRNN.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + OneShotRNN.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + OneShotRNN.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - 
batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - - config_file_path = OneShotRNN.get_config_file_path(model_dir_path) - weight_file_path = OneShotRNN.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = OneShotRNN.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.transform_target_encoding(Ytrain) - Ytest = self.transform_target_encoding(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - train_num_batches = len(Xtrain) // batch_size - test_num_batches = len(Xtest) // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = [] - input_wids = [] - for word in input_text.lower().split(' '): - idx = 1 # default [UNK] - if word in self.input_word2idx: - idx = self.input_word2idx[word] - input_wids.append(idx) - input_seq.append(input_wids) - input_seq = pad_sequences(input_seq, self.max_input_seq_length) - predicted = self.model.predict(input_seq) - predicted_word_idx_list = np.argmax(predicted, axis=1) - predicted_word_list = [self.target_idx2word[wid] for wid in predicted_word_idx_list[0]] - return predicted_word_list - - -class RecursiveRNN1(object): - model_name = 'recursive-rnn-1' - """ - A second alternative model is to develop a model that generates a single word forecast and call it recursively. - - That is, the decoder uses the context vector and the distributed representation of all words generated so far as - input in order to generate the next word. - - A language model can be used to interpret the sequence of words generated so far to provide a second context vector - to combine with the representation of the source document in order to generate the next word in the sequence. - - The summary is built up by recursively calling the model with the previously generated word appended (or, more - specifically, the expected previous word during training). - - The context vectors could be concentrated or added together to provide a broader context for the decoder to - interpret and output the next word. 
- """ - - def __init__(self, config): - self.num_input_tokens = config['num_input_tokens'] - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.input_word2idx = config['input_word2idx'] - self.input_idx2word = config['input_idx2word'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - if 'version' in config: - self.version = config['version'] - else: - self.version = 0 - self.config = config - - print('max_input_seq_length', self.max_input_seq_length) - print('max_target_seq_length', self.max_target_seq_length) - print('num_input_tokens', self.num_input_tokens) - print('num_target_tokens', self.num_target_tokens) - - inputs1 = Input(shape=(self.max_input_seq_length,)) - am1 = Embedding(self.num_input_tokens, 128)(inputs1) - am2 = LSTM(128)(am1) - - inputs2 = Input(shape=(self.max_target_seq_length,)) - sm1 = Embedding(self.num_target_tokens, 128)(inputs2) - sm2 = LSTM(128)(sm1) - - decoder1 = concatenate([am2, sm2]) - outputs = Dense(self.num_target_tokens, activation='softmax')(decoder1) - - model = Model(inputs=[inputs1, inputs2], outputs=outputs) - - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) - self.model = model - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - self.model.load_weights(weight_file_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = [] - for word in line.lower().split(' '): - wid = 1 - if word in self.input_word2idx: - wid = self.input_word2idx[word] - x.append(wid) - if len(x) >= self.max_input_seq_length: - break - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def split_target_text(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x)+1 >= self.max_target_seq_length: - x.append('END') - break - temp.append(x) - return temp - - def generate_batch(self, x_samples, y_samples, batch_size): - encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - line_idx = 0 - while True: - for recordIdx in range(0, len(x_samples)): - target_words = y_samples[recordIdx] - x = x_samples[recordIdx] - decoder_input_line = [] - - for idx in range(0, len(target_words)-1): - w2idx = 0 # default [UNK] - w = target_words[idx] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - decoder_input_line = decoder_input_line + [w2idx] - decoder_target_label = np.zeros(self.num_target_tokens) - w2idx_next = 0 - if target_words[idx+1] in self.target_word2idx: - w2idx_next = self.target_word2idx[target_words[idx+1]] - if w2idx_next != 0: - decoder_target_label[w2idx_next] = 1 - decoder_input_data_batch.append(decoder_input_line) - encoder_input_data_batch.append(x) - decoder_target_data_batch.append(decoder_target_label) - - line_idx += 1 - if line_idx >= batch_size: - yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), - pad_sequences(decoder_input_data_batch, - self.max_target_seq_length)], np.array(decoder_target_data_batch) - line_idx = 0 - encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN1.model_name + 
'-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN1.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN1.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - - config_file_path = RecursiveRNN1.get_config_file_path(model_dir_path) - weight_file_path = RecursiveRNN1.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = RecursiveRNN1.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.split_target_text(Ytrain) - Ytest = self.split_target_text(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) - total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) - train_num_batches = total_training_samples // batch_size - test_num_batches = total_testing_samples // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = [] - input_wids = [] - for word in input_text.lower().split(' '): - idx = 1 # default [UNK] - if word in self.input_word2idx: - idx = self.input_word2idx[word] - input_wids.append(idx) - input_seq.append(input_wids) - input_seq = pad_sequences(input_seq, self.max_input_seq_length) - start_token = self.target_word2idx['START'] - wid_list = [start_token] - sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) - terminated = False - - target_text = '' - - while not terminated: - output_tokens = self.model.predict([input_seq, sum_input_seq]) - sample_token_idx = np.argmax(output_tokens[0, :]) - sample_word = self.target_idx2word[sample_token_idx] - wid_list = wid_list + [sample_token_idx] - - if sample_word != 'START' and sample_word != 'END': - target_text += ' ' + sample_word - - if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: - terminated = True - else: - sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) - return target_text.strip() - - -class RecursiveRNN2(object): - model_name = 'recursive-rnn-2' - """ - In this third alternative, the Encoder generates a context vector representation of the source document. - - This document is fed to the decoder at each step of the generated output sequence. This allows the decoder to build - up the same internal state as was used to generate the words in the output sequence so that it is primed to generate - the next word in the sequence. - - This process is then repeated by calling the model again and again for each word in the output sequence until a - maximum length or end-of-sequence token is generated. 
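RecursiveRNN2 caps its decoder input at MAX_DECODER_SEQ_LENGTH timesteps, and since pad_sequences truncates from the front by default (truncating='pre'), the decoder effectively sees a sliding window of the most recently generated words. A small sketch of that windowing behavior (assumes Keras is installed; the word ids are illustrative):

```python
from keras.preprocessing.sequence import pad_sequences

MAX_DECODER_SEQ_LENGTH = 4
wid_list = [1, 7, 12, 9, 3, 15]  # hypothetical generated word ids, oldest first

# Default truncating='pre' drops the oldest ids, keeping the last 4 as the window.
window = pad_sequences([wid_list], MAX_DECODER_SEQ_LENGTH)
print(window)  # [[12  9  3 15]]
```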
- """ - - MAX_DECODER_SEQ_LENGTH = 4 - - def __init__(self, config): - self.num_input_tokens = config['num_input_tokens'] - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.input_word2idx = config['input_word2idx'] - self.input_idx2word = config['input_idx2word'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.config = config - - self.version = 0 - if 'version' in config: - self.version = config['version'] - - # article input model - inputs1 = Input(shape=(self.max_input_seq_length,)) - article1 = Embedding(self.num_input_tokens, 128)(inputs1) - article2 = Dropout(0.3)(article1) - - # summary input model - inputs2 = Input(shape=(min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH), )) - summ1 = Embedding(self.num_target_tokens, 128)(inputs2) - summ2 = Dropout(0.3)(summ1) - summ3 = LSTM(128)(summ2) - summ4 = RepeatVector(self.max_input_seq_length)(summ3) - - # decoder model - decoder1 = concatenate([article2, summ4]) - decoder2 = LSTM(128)(decoder1) - outputs = Dense(self.num_target_tokens, activation='softmax')(decoder2) - # tie it together [article, summary] [word] - model = Model(inputs=[inputs1, inputs2], outputs=outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) - - print(model.summary()) - - self.model = model - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - print('loading weights from ', weight_file_path) - self.model.load_weights(weight_file_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = [] - for word in line.lower().split(' '): - wid = 1 - if word in self.input_word2idx: - wid = self.input_word2idx[word] - x.append(wid) - if len(x) >= self.max_input_seq_length: - break - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def split_target_text(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x)+1 >= self.max_target_seq_length: - x.append('END') - break - temp.append(x) - return temp - - def generate_batch(self, x_samples, y_samples, batch_size): - encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - line_idx = 0 - while True: - for recordIdx in range(0, len(x_samples)): - target_words = y_samples[recordIdx] - x = x_samples[recordIdx] - decoder_input_line = [] - - for idx in range(0, len(target_words)-1): - w2idx = 0 # default [UNK] - w = target_words[idx] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - decoder_input_line = decoder_input_line + [w2idx] - decoder_target_label = np.zeros(self.num_target_tokens) - w2idx_next = 0 - if target_words[idx+1] in self.target_word2idx: - w2idx_next = self.target_word2idx[target_words[idx+1]] - if w2idx_next != 0: - decoder_target_label[w2idx_next] = 1 - - decoder_input_data_batch.append(decoder_input_line) - encoder_input_data_batch.append(x) - decoder_target_data_batch.append(decoder_target_label) - - line_idx += 1 - if line_idx >= batch_size: - yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), - pad_sequences(decoder_input_data_batch, - min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH))], np.array(decoder_target_data_batch) - line_idx = 0 
- encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - - config_file_path = RecursiveRNN2.get_config_file_path(model_dir_path) - weight_file_path = RecursiveRNN2.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = RecursiveRNN2.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.split_target_text(Ytrain) - Ytest = self.split_target_text(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) - total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) - train_num_batches = total_training_samples // batch_size - test_num_batches = total_testing_samples // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = [] - input_wids = [] - for word in input_text.lower().split(' '): - idx = 1 # default [UNK] - if word in self.input_word2idx: - idx = self.input_word2idx[word] - input_wids.append(idx) - input_seq.append(input_wids) - input_seq = pad_sequences(input_seq, self.max_input_seq_length) - start_token = self.target_word2idx['START'] - wid_list = [start_token] - sum_input_seq = pad_sequences([wid_list], min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH)) - terminated = False - - target_text = '' - - while not terminated: - output_tokens = self.model.predict([input_seq, sum_input_seq]) - sample_token_idx = np.argmax(output_tokens[0, :]) - sample_word = self.target_idx2word[sample_token_idx] - wid_list = wid_list + [sample_token_idx] - - if sample_word != 'START' and sample_word != 'END': - target_text += ' ' + sample_word - - if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: - terminated = True - else: - sum_input_seq = pad_sequences([wid_list], min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH)) - return target_text.strip() - - -class RecursiveRNN3(object): - model_name = 'recursive-rnn-3' - """ - In this third alternative, the Encoder generates a context vector representation of the source document. - - This document is fed to the decoder at each step of the generated output sequence. 
This allows the decoder to build - up the same internal state as was used to generate the words in the output sequence so that it is primed to generate - the next word in the sequence. - - This process is then repeated by calling the model again and again for each word in the output sequence until a - maximum length or end-of-sequence token is generated. - """ - - def __init__(self, config): - self.num_input_tokens = config['num_input_tokens'] - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.input_word2idx = config['input_word2idx'] - self.input_idx2word = config['input_idx2word'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.config = config - - self.version = 0 - if 'version' in config: - self.version = config['version'] - - # article input model - inputs1 = Input(shape=(self.max_input_seq_length,)) - article1 = Embedding(self.num_input_tokens, 128)(inputs1) - article2 = LSTM(128)(article1) - article3 = RepeatVector(128)(article2) - # summary input model - inputs2 = Input(shape=(self.max_target_seq_length,)) - summ1 = Embedding(self.num_target_tokens, 128)(inputs2) - summ2 = LSTM(128)(summ1) - summ3 = RepeatVector(128)(summ2) - # decoder model - decoder1 = concatenate([article3, summ3]) - decoder2 = LSTM(128)(decoder1) - outputs = Dense(self.num_target_tokens, activation='softmax')(decoder2) - # tie it together [article, summary] [word] - model = Model(inputs=[inputs1, inputs2], outputs=outputs) - model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) - - print(model.summary()) - - self.model = model - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - print('loading weights from ', weight_file_path) - self.model.load_weights(weight_file_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = [] - for word in line.lower().split(' '): - wid = 1 - if word in self.input_word2idx: - wid = self.input_word2idx[word] - x.append(wid) - if len(x) >= self.max_input_seq_length: - break - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def split_target_text(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x)+1 >= self.max_target_seq_length: - x.append('END') - break - temp.append(x) - return temp - - def generate_batch(self, x_samples, y_samples, batch_size): - encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - line_idx = 0 - while True: - for recordIdx in range(0, len(x_samples)): - target_words = y_samples[recordIdx] - x = x_samples[recordIdx] - decoder_input_line = [] - - for idx in range(0, len(target_words)-1): - w2idx = 0 # default [UNK] - w = target_words[idx] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - decoder_input_line = decoder_input_line + [w2idx] - decoder_target_label = np.zeros(self.num_target_tokens) - w2idx_next = 0 - if target_words[idx+1] in self.target_word2idx: - w2idx_next = self.target_word2idx[target_words[idx+1]] - if w2idx_next != 0: - decoder_target_label[w2idx_next] = 1 - - decoder_input_data_batch.append(decoder_input_line) - encoder_input_data_batch.append(x) - decoder_target_data_batch.append(decoder_target_label) - - line_idx += 1 - 
if line_idx >= batch_size: - yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), - pad_sequences(decoder_input_data_batch, - self.max_target_seq_length)], np.array(decoder_target_data_batch) - line_idx = 0 - encoder_input_data_batch = [] - decoder_input_data_batch = [] - decoder_target_data_batch = [] - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + RecursiveRNN2.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - - config_file_path = RecursiveRNN2.get_config_file_path(model_dir_path) - weight_file_path = RecursiveRNN2.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = RecursiveRNN2.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.split_target_text(Ytrain) - Ytest = self.split_target_text(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) - total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) - train_num_batches = total_training_samples // batch_size - test_num_batches = total_testing_samples // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = [] - input_wids = [] - for word in input_text.lower().split(' '): - idx = 1 # default [UNK] - if word in self.input_word2idx: - idx = self.input_word2idx[word] - input_wids.append(idx) - input_seq.append(input_wids) - input_seq = pad_sequences(input_seq, self.max_input_seq_length) - start_token = self.target_word2idx['START'] - wid_list = [start_token] - sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) - terminated = False - - target_text = '' - - while not terminated: - output_tokens = self.model.predict([input_seq, sum_input_seq]) - sample_token_idx = np.argmax(output_tokens[0, :]) - sample_word = self.target_idx2word[sample_token_idx] - wid_list = wid_list + [sample_token_idx] - - if sample_word != 'START' and sample_word != 'END': - target_text += ' ' + sample_word - - if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: - terminated = True - else: - sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) - return target_text.strip() +from __future__ import print_function + +from keras.models import Model, Sequential +from keras.layers import Embedding, Dense, Input, RepeatVector, TimeDistributed, concatenate, Merge, add, 
Dropout +from keras.layers.recurrent import LSTM +from keras.preprocessing.sequence import pad_sequences +from keras.callbacks import ModelCheckpoint +import numpy as np +import os + +HIDDEN_UNITS = 100 +DEFAULT_BATCH_SIZE = 64 +VERBOSE = 1 +DEFAULT_EPOCHS = 10 + + +class OneShotRNN(object): + model_name = 'one-shot-rnn' + """ + The first alternative model is to generate the entire output sequence in a one-shot manner. + That is, the decoder uses the context vector alone to generate the output sequence. + + This model puts a heavy burden on the decoder. + It is likely that the decoder will not have sufficient context for generating a coherent output sequence as it + must choose the words and their order. + """ + + def __init__(self, config): + self.num_input_tokens = config['num_input_tokens'] + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.input_word2idx = config['input_word2idx'] + self.input_idx2word = config['input_idx2word'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.config = config + self.version = 0 + if 'version' in config: + self.version = config['version'] + + print('max_input_seq_length', self.max_input_seq_length) + print('max_target_seq_length', self.max_target_seq_length) + print('num_input_tokens', self.num_input_tokens) + print('num_target_tokens', self.num_target_tokens) + + # encoder input model + model = Sequential() + model.add(Embedding(output_dim=128, input_dim=self.num_input_tokens, input_length=self.max_input_seq_length)) + + # encoder model + model.add(LSTM(128)) + model.add(RepeatVector(self.max_target_seq_length)) + # decoder model + model.add(LSTM(128, return_sequences=True)) + model.add(TimeDistributed(Dense(self.num_target_tokens, activation='softmax'))) + + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) + + self.model = model + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + self.model.load_weights(weight_file_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = [] + for word in line.lower().split(' '): + wid = 1 + if word in self.input_word2idx: + wid = self.input_word2idx[word] + x.append(wid) + if len(x) >= self.max_input_seq_length: + break + temp.append(x) + temp = pad_sequences(temp, maxlen=self.max_input_seq_length) + + print(temp.shape) + return temp + + def transform_target_encoding(self, texts): + temp = [] + for line in texts: + x = [] + line2 = 'START ' + line.lower() + ' END' + for word in line2.split(' '): + x.append(word) + if len(x) >= self.max_target_seq_length: + break + temp.append(x) + + temp = np.array(temp) + print(temp.shape) + return temp + + def generate_batch(self, x_samples, y_samples, batch_size): + num_batches = len(x_samples) // batch_size + while True: + for batchIdx in range(0, num_batches): + start = batchIdx * batch_size + end = (batchIdx + 1) * batch_size + encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) + decoder_target_data_batch = np.zeros( + shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) + for lineIdx, target_words in enumerate(y_samples[start:end]): + for idx, w in enumerate(target_words): + w2idx = 0 # default [UNK] + if w in self.target_word2idx: + w2idx = self.target_word2idx[w] + if w2idx != 0: + decoder_target_data_batch[lineIdx, idx, 
w2idx] = 1 + yield encoder_input_data_batch, decoder_target_data_batch + + @staticmethod + def get_weight_file_path(model_dir_path): + return model_dir_path + '/' + OneShotRNN.model_name + '-weights.h5' + + @staticmethod + def get_config_file_path(model_dir_path): + return model_dir_path + '/' + OneShotRNN.model_name + '-config.npy' + + @staticmethod + def get_architecture_file_path(model_dir_path): + return model_dir_path + '/' + OneShotRNN.model_name + '-architecture.json' + + def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): + if epochs is None: + epochs = DEFAULT_EPOCHS + if model_dir_path is None: + model_dir_path = './models' + if batch_size is None: + batch_size = DEFAULT_BATCH_SIZE + + self.version += 1 + self.config['version'] = self.version + + config_file_path = OneShotRNN.get_config_file_path(model_dir_path) + weight_file_path = OneShotRNN.get_weight_file_path(model_dir_path) + checkpoint = ModelCheckpoint(weight_file_path) + np.save(config_file_path, self.config) + architecture_file_path = OneShotRNN.get_architecture_file_path(model_dir_path) + open(architecture_file_path, 'w').write(self.model.to_json()) + + Ytrain = self.transform_target_encoding(Ytrain) + Ytest = self.transform_target_encoding(Ytest) + + Xtrain = self.transform_input_text(Xtrain) + Xtest = self.transform_input_text(Xtest) + + train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) + test_gen = self.generate_batch(Xtest, Ytest, batch_size) + + train_num_batches = len(Xtrain) // batch_size + test_num_batches = len(Xtest) // batch_size + + history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, + epochs=epochs, + verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, + callbacks=[checkpoint]) + self.model.save_weights(weight_file_path) + return history + + def summarize(self, input_text): + input_seq = [] + input_wids = [] + for word in input_text.lower().split(' '): + idx = 1 # default [UNK] + if word in self.input_word2idx: + idx = self.input_word2idx[word] + input_wids.append(idx) + input_seq.append(input_wids) + input_seq = pad_sequences(input_seq, self.max_input_seq_length) + predicted = self.model.predict(input_seq) + predicted_word_idx_list = np.argmax(predicted, axis=1) + predicted_word_list = [self.target_idx2word[wid] for wid in predicted_word_idx_list[0]] + return predicted_word_list + + +class RecursiveRNN1(object): + model_name = 'recursive-rnn-1' + """ + A second alternative model is to develop a model that generates a single word forecast and call it recursively. + + That is, the decoder uses the context vector and the distributed representation of all words generated so far as + input in order to generate the next word. + + A language model can be used to interpret the sequence of words generated so far to provide a second context vector + to combine with the representation of the source document in order to generate the next word in the sequence. + + The summary is built up by recursively calling the model with the previously generated word appended (or, more + specifically, the expected previous word during training). + + The context vectors could be concentrated or added together to provide a broader context for the decoder to + interpret and output the next word. 
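Whether the two context vectors are concatenated or added changes the width of what the final softmax layer sees; RecursiveRNN1 below opts for concatenation (concatenate([am2, sm2])). A NumPy sketch of the shape difference, with hypothetical 128-dimensional context vectors:

```python
import numpy as np

article_context = np.random.rand(1, 128)  # encoder LSTM output for the article
summary_context = np.random.rand(1, 128)  # language-model LSTM output for the prefix

concat = np.concatenate([article_context, summary_context], axis=-1)
added = article_context + summary_context

print(concat.shape)  # (1, 256): a wider input for the softmax, as in RecursiveRNN1
print(added.shape)   # (1, 128): same width, contexts blended element-wise
```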
+ """ + + def __init__(self, config): + self.num_input_tokens = config['num_input_tokens'] + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.input_word2idx = config['input_word2idx'] + self.input_idx2word = config['input_idx2word'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + if 'version' in config: + self.version = config['version'] + else: + self.version = 0 + self.config = config + + print('max_input_seq_length', self.max_input_seq_length) + print('max_target_seq_length', self.max_target_seq_length) + print('num_input_tokens', self.num_input_tokens) + print('num_target_tokens', self.num_target_tokens) + + inputs1 = Input(shape=(self.max_input_seq_length,)) + am1 = Embedding(self.num_input_tokens, 128)(inputs1) + am2 = LSTM(128)(am1) + + inputs2 = Input(shape=(self.max_target_seq_length,)) + sm1 = Embedding(self.num_target_tokens, 128)(inputs2) + sm2 = LSTM(128)(sm1) + + decoder1 = concatenate([am2, sm2]) + outputs = Dense(self.num_target_tokens, activation='softmax')(decoder1) + + model = Model(inputs=[inputs1, inputs2], outputs=outputs) + + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) + self.model = model + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + self.model.load_weights(weight_file_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = [] + for word in line.lower().split(' '): + wid = 1 + if word in self.input_word2idx: + wid = self.input_word2idx[word] + x.append(wid) + if len(x) >= self.max_input_seq_length: + break + temp.append(x) + temp = pad_sequences(temp, maxlen=self.max_input_seq_length) + + print(temp.shape) + return temp + + def split_target_text(self, texts): + temp = [] + for line in texts: + x = [] + line2 = 'START ' + line.lower() + ' END' + for word in line2.split(' '): + x.append(word) + if len(x)+1 >= self.max_target_seq_length: + x.append('END') + break + temp.append(x) + return temp + + def generate_batch(self, x_samples, y_samples, batch_size): + encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + line_idx = 0 + while True: + for recordIdx in range(0, len(x_samples)): + target_words = y_samples[recordIdx] + x = x_samples[recordIdx] + decoder_input_line = [] + + for idx in range(0, len(target_words)-1): + w2idx = 0 # default [UNK] + w = target_words[idx] + if w in self.target_word2idx: + w2idx = self.target_word2idx[w] + decoder_input_line = decoder_input_line + [w2idx] + decoder_target_label = np.zeros(self.num_target_tokens) + w2idx_next = 0 + if target_words[idx+1] in self.target_word2idx: + w2idx_next = self.target_word2idx[target_words[idx+1]] + if w2idx_next != 0: + decoder_target_label[w2idx_next] = 1 + decoder_input_data_batch.append(decoder_input_line) + encoder_input_data_batch.append(x) + decoder_target_data_batch.append(decoder_target_label) + + line_idx += 1 + if line_idx >= batch_size: + yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), + pad_sequences(decoder_input_data_batch, + self.max_target_seq_length)], np.array(decoder_target_data_batch) + line_idx = 0 + encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + + @staticmethod + def get_weight_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN1.model_name + 
'-weights.h5' + + @staticmethod + def get_config_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN1.model_name + '-config.npy' + + @staticmethod + def get_architecture_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN1.model_name + '-architecture.json' + + def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): + if epochs is None: + epochs = DEFAULT_EPOCHS + if model_dir_path is None: + model_dir_path = './models' + if batch_size is None: + batch_size = DEFAULT_BATCH_SIZE + + self.version += 1 + self.config['version'] = self.version + + config_file_path = RecursiveRNN1.get_config_file_path(model_dir_path) + weight_file_path = RecursiveRNN1.get_weight_file_path(model_dir_path) + checkpoint = ModelCheckpoint(weight_file_path) + np.save(config_file_path, self.config) + architecture_file_path = RecursiveRNN1.get_architecture_file_path(model_dir_path) + open(architecture_file_path, 'w').write(self.model.to_json()) + + Ytrain = self.split_target_text(Ytrain) + Ytest = self.split_target_text(Ytest) + + Xtrain = self.transform_input_text(Xtrain) + Xtest = self.transform_input_text(Xtest) + + train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) + test_gen = self.generate_batch(Xtest, Ytest, batch_size) + + total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) + total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) + train_num_batches = total_training_samples // batch_size + test_num_batches = total_testing_samples // batch_size + + history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, + epochs=epochs, + verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, + callbacks=[checkpoint]) + self.model.save_weights(weight_file_path) + return history + + def summarize(self, input_text): + input_seq = [] + input_wids = [] + for word in input_text.lower().split(' '): + idx = 1 # default [UNK] + if word in self.input_word2idx: + idx = self.input_word2idx[word] + input_wids.append(idx) + input_seq.append(input_wids) + input_seq = pad_sequences(input_seq, self.max_input_seq_length) + start_token = self.target_word2idx['START'] + wid_list = [start_token] + sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) + terminated = False + + target_text = '' + + while not terminated: + output_tokens = self.model.predict([input_seq, sum_input_seq]) + sample_token_idx = np.argmax(output_tokens[0, :]) + sample_word = self.target_idx2word[sample_token_idx] + wid_list = wid_list + [sample_token_idx] + + if sample_word != 'START' and sample_word != 'END': + target_text += ' ' + sample_word + + if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: + terminated = True + else: + sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) + return target_text.strip() + + +class RecursiveRNN2(object): + model_name = 'recursive-rnn-2' + """ + In this third alternative, the Encoder generates a context vector representation of the source document. + + This document is fed to the decoder at each step of the generated output sequence. This allows the decoder to build + up the same internal state as was used to generate the words in the output sequence so that it is primed to generate + the next word in the sequence. + + This process is then repeated by calling the model again and again for each word in the output sequence until a + maximum length or end-of-sequence token is generated. 
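The stopping rule described above (call the model again and again until END or a maximum length) is what each summarize method implements. A model-free sketch of that greedy loop, with a stubbed predict standing in for the Keras model:

```python
import numpy as np

idx2word = {0: '[UNK]', 1: 'START', 2: 'good', 3: 'news', 4: 'END'}
max_target_seq_length = 10

def fake_predict(step, script=(2, 3, 4)):
    """Stub: returns a one-hot 'softmax' following a fixed script, then END."""
    probs = np.zeros(len(idx2word))
    probs[script[step] if step < len(script) else 4] = 1.0
    return probs

words, step = [], 0
while True:
    token = int(np.argmax(fake_predict(step)))
    word = idx2word[token]
    step += 1
    if word == 'END' or step >= max_target_seq_length:
        break
    if word != 'START':
        words.append(word)
print(' '.join(words))  # good news
```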
+ """ + + MAX_DECODER_SEQ_LENGTH = 4 + + def __init__(self, config): + self.num_input_tokens = config['num_input_tokens'] + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.input_word2idx = config['input_word2idx'] + self.input_idx2word = config['input_idx2word'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.config = config + + self.version = 0 + if 'version' in config: + self.version = config['version'] + + # article input model + inputs1 = Input(shape=(self.max_input_seq_length,)) + article1 = Embedding(self.num_input_tokens, 128)(inputs1) + article2 = Dropout(0.3)(article1) + + # summary input model + inputs2 = Input(shape=(min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH), )) + summ1 = Embedding(self.num_target_tokens, 128)(inputs2) + summ2 = Dropout(0.3)(summ1) + summ3 = LSTM(128)(summ2) + summ4 = RepeatVector(self.max_input_seq_length)(summ3) + + # decoder model + decoder1 = concatenate([article2, summ4]) + decoder2 = LSTM(128)(decoder1) + outputs = Dense(self.num_target_tokens, activation='softmax')(decoder2) + # tie it together [article, summary] [word] + model = Model(inputs=[inputs1, inputs2], outputs=outputs) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) + + print(model.summary()) + + self.model = model + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + print('loading weights from ', weight_file_path) + self.model.load_weights(weight_file_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = [] + for word in line.lower().split(' '): + wid = 1 + if word in self.input_word2idx: + wid = self.input_word2idx[word] + x.append(wid) + if len(x) >= self.max_input_seq_length: + break + temp.append(x) + temp = pad_sequences(temp, maxlen=self.max_input_seq_length) + + print(temp.shape) + return temp + + def split_target_text(self, texts): + temp = [] + for line in texts: + x = [] + line2 = 'START ' + line.lower() + ' END' + for word in line2.split(' '): + x.append(word) + if len(x)+1 >= self.max_target_seq_length: + x.append('END') + break + temp.append(x) + return temp + + def generate_batch(self, x_samples, y_samples, batch_size): + encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + line_idx = 0 + while True: + for recordIdx in range(0, len(x_samples)): + target_words = y_samples[recordIdx] + x = x_samples[recordIdx] + decoder_input_line = [] + + for idx in range(0, len(target_words)-1): + w2idx = 0 # default [UNK] + w = target_words[idx] + if w in self.target_word2idx: + w2idx = self.target_word2idx[w] + decoder_input_line = decoder_input_line + [w2idx] + decoder_target_label = np.zeros(self.num_target_tokens) + w2idx_next = 0 + if target_words[idx+1] in self.target_word2idx: + w2idx_next = self.target_word2idx[target_words[idx+1]] + if w2idx_next != 0: + decoder_target_label[w2idx_next] = 1 + + decoder_input_data_batch.append(decoder_input_line) + encoder_input_data_batch.append(x) + decoder_target_data_batch.append(decoder_target_label) + + line_idx += 1 + if line_idx >= batch_size: + yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), + pad_sequences(decoder_input_data_batch, + min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH))], np.array(decoder_target_data_batch) + line_idx = 0 
+ encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + + @staticmethod + def get_weight_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-weights.h5' + + @staticmethod + def get_config_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-config.npy' + + @staticmethod + def get_architecture_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-architecture.json' + + def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): + if epochs is None: + epochs = DEFAULT_EPOCHS + if model_dir_path is None: + model_dir_path = './models' + if batch_size is None: + batch_size = DEFAULT_BATCH_SIZE + + self.version += 1 + self.config['version'] = self.version + + config_file_path = RecursiveRNN2.get_config_file_path(model_dir_path) + weight_file_path = RecursiveRNN2.get_weight_file_path(model_dir_path) + checkpoint = ModelCheckpoint(weight_file_path) + np.save(config_file_path, self.config) + architecture_file_path = RecursiveRNN2.get_architecture_file_path(model_dir_path) + open(architecture_file_path, 'w').write(self.model.to_json()) + + Ytrain = self.split_target_text(Ytrain) + Ytest = self.split_target_text(Ytest) + + Xtrain = self.transform_input_text(Xtrain) + Xtest = self.transform_input_text(Xtest) + + train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) + test_gen = self.generate_batch(Xtest, Ytest, batch_size) + + total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) + total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) + train_num_batches = total_training_samples // batch_size + test_num_batches = total_testing_samples // batch_size + + history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, + epochs=epochs, + verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, + callbacks=[checkpoint]) + self.model.save_weights(weight_file_path) + return history + + def summarize(self, input_text): + input_seq = [] + input_wids = [] + for word in input_text.lower().split(' '): + idx = 1 # default [UNK] + if word in self.input_word2idx: + idx = self.input_word2idx[word] + input_wids.append(idx) + input_seq.append(input_wids) + input_seq = pad_sequences(input_seq, self.max_input_seq_length) + start_token = self.target_word2idx['START'] + wid_list = [start_token] + sum_input_seq = pad_sequences([wid_list], min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH)) + terminated = False + + target_text = '' + + while not terminated: + output_tokens = self.model.predict([input_seq, sum_input_seq]) + sample_token_idx = np.argmax(output_tokens[0, :]) + sample_word = self.target_idx2word[sample_token_idx] + wid_list = wid_list + [sample_token_idx] + + if sample_word != 'START' and sample_word != 'END': + target_text += ' ' + sample_word + + if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: + terminated = True + else: + sum_input_seq = pad_sequences([wid_list], min(self.num_target_tokens, RecursiveRNN2.MAX_DECODER_SEQ_LENGTH)) + return target_text.strip() + + +class RecursiveRNN3(object): + model_name = 'recursive-rnn-3' + """ + In this third alternative, the Encoder generates a context vector representation of the source document. + + This document is fed to the decoder at each step of the generated output sequence. 
This allows the decoder to build + up the same internal state as was used to generate the words in the output sequence so that it is primed to generate + the next word in the sequence. + + This process is then repeated by calling the model again and again for each word in the output sequence until a + maximum length or end-of-sequence token is generated. + """ + + def __init__(self, config): + self.num_input_tokens = config['num_input_tokens'] + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.input_word2idx = config['input_word2idx'] + self.input_idx2word = config['input_idx2word'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.config = config + + self.version = 0 + if 'version' in config: + self.version = config['version'] + + # article input model + inputs1 = Input(shape=(self.max_input_seq_length,)) + article1 = Embedding(self.num_input_tokens, 128)(inputs1) + article2 = LSTM(128)(article1) + article3 = RepeatVector(128)(article2) + # summary input model + inputs2 = Input(shape=(self.max_target_seq_length,)) + summ1 = Embedding(self.num_target_tokens, 128)(inputs2) + summ2 = LSTM(128)(summ1) + summ3 = RepeatVector(128)(summ2) + # decoder model + decoder1 = concatenate([article3, summ3]) + decoder2 = LSTM(128)(decoder1) + outputs = Dense(self.num_target_tokens, activation='softmax')(decoder2) + # tie it together [article, summary] [word] + model = Model(inputs=[inputs1, inputs2], outputs=outputs) + model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) + + print(model.summary()) + + self.model = model + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + print('loading weights from ', weight_file_path) + self.model.load_weights(weight_file_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = [] + for word in line.lower().split(' '): + wid = 1 + if word in self.input_word2idx: + wid = self.input_word2idx[word] + x.append(wid) + if len(x) >= self.max_input_seq_length: + break + temp.append(x) + temp = pad_sequences(temp, maxlen=self.max_input_seq_length) + + print(temp.shape) + return temp + + def split_target_text(self, texts): + temp = [] + for line in texts: + x = [] + line2 = 'START ' + line.lower() + ' END' + for word in line2.split(' '): + x.append(word) + if len(x)+1 >= self.max_target_seq_length: + x.append('END') + break + temp.append(x) + return temp + + def generate_batch(self, x_samples, y_samples, batch_size): + encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + line_idx = 0 + while True: + for recordIdx in range(0, len(x_samples)): + target_words = y_samples[recordIdx] + x = x_samples[recordIdx] + decoder_input_line = [] + + for idx in range(0, len(target_words)-1): + w2idx = 0 # default [UNK] + w = target_words[idx] + if w in self.target_word2idx: + w2idx = self.target_word2idx[w] + decoder_input_line = decoder_input_line + [w2idx] + decoder_target_label = np.zeros(self.num_target_tokens) + w2idx_next = 0 + if target_words[idx+1] in self.target_word2idx: + w2idx_next = self.target_word2idx[target_words[idx+1]] + if w2idx_next != 0: + decoder_target_label[w2idx_next] = 1 + + decoder_input_data_batch.append(decoder_input_line) + encoder_input_data_batch.append(x) + decoder_target_data_batch.append(decoder_target_label) + + line_idx += 1 + 
if line_idx >= batch_size: + yield [pad_sequences(encoder_input_data_batch, self.max_input_seq_length), + pad_sequences(decoder_input_data_batch, + self.max_target_seq_length)], np.array(decoder_target_data_batch) + line_idx = 0 + encoder_input_data_batch = [] + decoder_input_data_batch = [] + decoder_target_data_batch = [] + + @staticmethod + def get_weight_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-weights.h5' + + @staticmethod + def get_config_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-config.npy' + + @staticmethod + def get_architecture_file_path(model_dir_path): + return model_dir_path + '/' + RecursiveRNN2.model_name + '-architecture.json' + + def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, model_dir_path=None, batch_size=None): + if epochs is None: + epochs = DEFAULT_EPOCHS + if model_dir_path is None: + model_dir_path = './models' + if batch_size is None: + batch_size = DEFAULT_BATCH_SIZE + + self.version += 1 + self.config['version'] = self.version + + config_file_path = RecursiveRNN2.get_config_file_path(model_dir_path) + weight_file_path = RecursiveRNN2.get_weight_file_path(model_dir_path) + checkpoint = ModelCheckpoint(weight_file_path) + np.save(config_file_path, self.config) + architecture_file_path = RecursiveRNN2.get_architecture_file_path(model_dir_path) + open(architecture_file_path, 'w').write(self.model.to_json()) + + Ytrain = self.split_target_text(Ytrain) + Ytest = self.split_target_text(Ytest) + + Xtrain = self.transform_input_text(Xtrain) + Xtest = self.transform_input_text(Xtest) + + train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) + test_gen = self.generate_batch(Xtest, Ytest, batch_size) + + total_training_samples = sum([len(target_text)-1 for target_text in Ytrain]) + total_testing_samples = sum([len(target_text)-1 for target_text in Ytest]) + train_num_batches = total_training_samples // batch_size + test_num_batches = total_testing_samples // batch_size + + history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, + epochs=epochs, + verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, + callbacks=[checkpoint]) + self.model.save_weights(weight_file_path) + return history + + def summarize(self, input_text): + input_seq = [] + input_wids = [] + for word in input_text.lower().split(' '): + idx = 1 # default [UNK] + if word in self.input_word2idx: + idx = self.input_word2idx[word] + input_wids.append(idx) + input_seq.append(input_wids) + input_seq = pad_sequences(input_seq, self.max_input_seq_length) + start_token = self.target_word2idx['START'] + wid_list = [start_token] + sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) + terminated = False + + target_text = '' + + while not terminated: + output_tokens = self.model.predict([input_seq, sum_input_seq]) + sample_token_idx = np.argmax(output_tokens[0, :]) + sample_word = self.target_idx2word[sample_token_idx] + wid_list = wid_list + [sample_token_idx] + + if sample_word != 'START' and sample_word != 'END': + target_text += ' ' + sample_word + + if sample_word == 'END' or len(wid_list) >= self.max_target_seq_length: + terminated = True + else: + sum_input_seq = pad_sequences([wid_list], self.max_target_seq_length) + return target_text.strip() diff --git a/keras_text_summarization/library/seq2seq.py b/keras_text_summarization/library/seq2seq.py index 361c72c..b2326c1 100644 --- a/keras_text_summarization/library/seq2seq.py +++ 
b/keras_text_summarization/library/seq2seq.py @@ -1,591 +1,591 @@ -from __future__ import print_function - -from keras.models import Model -from keras.layers import Embedding, Dense, Input -from keras.layers.recurrent import LSTM -from keras.preprocessing.sequence import pad_sequences -from keras.callbacks import ModelCheckpoint -from keras_text_summarization.library.utility.glove_loader import load_glove, GLOVE_EMBEDDING_SIZE -import numpy as np -import os - -HIDDEN_UNITS = 100 -DEFAULT_BATCH_SIZE = 64 -VERBOSE = 1 -DEFAULT_EPOCHS = 10 - - -class Seq2SeqSummarizer(object): - - model_name = 'seq2seq' - - def __init__(self, config): - self.num_input_tokens = config['num_input_tokens'] - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.input_word2idx = config['input_word2idx'] - self.input_idx2word = config['input_idx2word'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.config = config - - self.version = 0 - if 'version' in config: - self.version = config['version'] - - encoder_inputs = Input(shape=(None,), name='encoder_inputs') - encoder_embedding = Embedding(input_dim=self.num_input_tokens, output_dim=HIDDEN_UNITS, - input_length=self.max_input_seq_length, name='encoder_embedding') - encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') - encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) - encoder_states = [encoder_state_h, encoder_state_c] - - decoder_inputs = Input(shape=(None, self.num_target_tokens), name='decoder_inputs') - decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') - decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, - initial_state=encoder_states) - decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') - decoder_outputs = decoder_dense(decoder_outputs) - - model = Model([encoder_inputs, decoder_inputs], decoder_outputs) - - model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) - - self.model = model - - self.encoder_model = Model(encoder_inputs, encoder_states) - - decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] - decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) - decoder_states = [state_h, state_c] - decoder_outputs = decoder_dense(decoder_outputs) - self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - self.model.load_weights(weight_file_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = [] - for word in line.lower().split(' '): - wid = 1 - if word in self.input_word2idx: - wid = self.input_word2idx[word] - x.append(wid) - if len(x) >= self.max_input_seq_length: - break - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def transform_target_encoding(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x) >= self.max_target_seq_length: - break - temp.append(x) - - temp = np.array(temp) - print(temp.shape) - return temp - 
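The generate_batch method that follows encodes the teacher-forcing shift: the one-hot for a word goes into the decoder input at step idx and into the target at step idx-1, so the decoder learns to predict each word from its predecessor. A NumPy sketch of that shift for a single line, with a toy vocabulary assumed for illustration:

```python
import numpy as np

target_word2idx = {'START': 1, 'fake': 2, 'news': 3, 'END': 4}
num_target_tokens, max_target_seq_length = 5, 6
target_words = ['START', 'fake', 'news', 'END']

decoder_input = np.zeros((max_target_seq_length, num_target_tokens))
decoder_target = np.zeros((max_target_seq_length, num_target_tokens))
for idx, w in enumerate(target_words):
    w2idx = target_word2idx.get(w, 0)  # 0 is the default [UNK]
    if w2idx != 0:
        decoder_input[idx, w2idx] = 1
        if idx > 0:
            decoder_target[idx - 1, w2idx] = 1  # target is the input shifted left by one

# At step 0 the decoder reads START and is trained to emit 'fake'.
print(decoder_input[0].argmax(), decoder_target[0].argmax())  # 1 2
```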
- def generate_batch(self, x_samples, y_samples, batch_size): - num_batches = len(x_samples) // batch_size - while True: - for batchIdx in range(0, num_batches): - start = batchIdx * batch_size - end = (batchIdx + 1) * batch_size - encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) - decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - for lineIdx, target_words in enumerate(y_samples[start:end]): - for idx, w in enumerate(target_words): - w2idx = 0 # default [UNK] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - if w2idx != 0: - decoder_input_data_batch[lineIdx, idx, w2idx] = 1 - if idx > 0: - decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1 - yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './demo/models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - config_file_path = Seq2SeqSummarizer.get_config_file_path(model_dir_path) - weight_file_path = Seq2SeqSummarizer.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = Seq2SeqSummarizer.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.transform_target_encoding(Ytrain) - Ytest = self.transform_target_encoding(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - train_num_batches = len(Xtrain) // batch_size - test_num_batches = len(Xtest) // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = [] - input_wids = [] - for word in input_text.lower().split(' '): - idx = 1 # default [UNK] - if word in self.input_word2idx: - idx = self.input_word2idx[word] - input_wids.append(idx) - input_seq.append(input_wids) - input_seq = pad_sequences(input_seq, self.max_input_seq_length) - states_value = self.encoder_model.predict(input_seq) - target_seq = np.zeros((1, 1, self.num_target_tokens)) - target_seq[0, 0, self.target_word2idx['START']] = 1 - target_text = '' - target_text_len = 0 - terminated = False - while not terminated: - output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) - - sample_token_idx = np.argmax(output_tokens[0, -1, :]) - sample_word = 
self.target_idx2word[sample_token_idx] - target_text_len += 1 - - if sample_word != 'START' and sample_word != 'END': - target_text += ' ' + sample_word - - if sample_word == 'END' or target_text_len >= self.max_target_seq_length: - terminated = True - - target_seq = np.zeros((1, 1, self.num_target_tokens)) - target_seq[0, 0, sample_token_idx] = 1 - - states_value = [h, c] - return target_text.strip() - - -class Seq2SeqGloVeSummarizer(object): - - model_name = 'seq2seq-glove' - - def __init__(self, config): - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.version = 0 - if 'version' in config: - self.version = config['version'] - - self.word2em = dict() - if 'unknown_emb' in config: - self.unknown_emb = config['unknown_emb'] - else: - self.unknown_emb = np.random.rand(1, GLOVE_EMBEDDING_SIZE) - config['unknown_emb'] = self.unknown_emb - - self.config = config - - encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') - encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') - encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) - encoder_states = [encoder_state_h, encoder_state_c] - - decoder_inputs = Input(shape=(None, self.num_target_tokens), name='decoder_inputs') - decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') - decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, - initial_state=encoder_states) - decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') - decoder_outputs = decoder_dense(decoder_outputs) - - model = Model([encoder_inputs, decoder_inputs], decoder_outputs) - - model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) - - self.model = model - - self.encoder_model = Model(encoder_inputs, encoder_states) - - decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] - decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) - decoder_states = [state_h, state_c] - decoder_outputs = decoder_dense(decoder_outputs) - self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - self.model.load_weights(weight_file_path) - - def load_glove(self, data_dir_path): - self.word2em = load_glove(data_dir_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = np.zeros(shape=(self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) - for idx, word in enumerate(line.lower().split(' ')): - if idx >= self.max_input_seq_length: - break - emb = self.unknown_emb - if word in self.word2em: - emb = self.word2em[word] - x[idx, :] = emb - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def transform_target_encoding(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'START ' + line.lower() + ' END' - for word in line2.split(' '): - x.append(word) - if len(x) >= self.max_target_seq_length: - break - temp.append(x) - - temp = np.array(temp) - print(temp.shape) - return temp - - def generate_batch(self, x_samples, y_samples, 
batch_size): - num_batches = len(x_samples) // batch_size - while True: - for batchIdx in range(0, num_batches): - start = batchIdx * batch_size - end = (batchIdx + 1) * batch_size - encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) - decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - for lineIdx, target_words in enumerate(y_samples[start:end]): - for idx, w in enumerate(target_words): - w2idx = 0 # default [UNK] - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - if w2idx != 0: - decoder_input_data_batch[lineIdx, idx, w2idx] = 1 - if idx > 0: - decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1 - yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - config_file_path = Seq2SeqGloVeSummarizer.get_config_file_path(model_dir_path) - weight_file_path = Seq2SeqGloVeSummarizer.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = Seq2SeqGloVeSummarizer.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.transform_target_encoding(Ytrain) - Ytest = self.transform_target_encoding(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - train_num_batches = len(Xtrain) // batch_size - test_num_batches = len(Xtest) // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = np.zeros(shape=(1, self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) - for idx, word in enumerate(input_text.lower().split(' ')): - if idx >= self.max_input_seq_length: - break - emb = self.unknown_emb # default [UNK] - if word in self.word2em: - emb = self.word2em[word] - input_seq[0, idx, :] = emb - states_value = self.encoder_model.predict(input_seq) - target_seq = np.zeros((1, 1, self.num_target_tokens)) - target_seq[0, 0, self.target_word2idx['START']] = 1 - target_text = '' - target_text_len = 0 - terminated = False - while not terminated: - output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) - - sample_token_idx = np.argmax(output_tokens[0, -1, :]) - sample_word = 
self.target_idx2word[sample_token_idx] - target_text_len += 1 - - if sample_word != 'START' and sample_word != 'END': - target_text += ' ' + sample_word - - if sample_word == 'END' or target_text_len >= self.max_target_seq_length: - terminated = True - - target_seq = np.zeros((1, 1, self.num_target_tokens)) - target_seq[0, 0, sample_token_idx] = 1 - - states_value = [h, c] - return target_text.strip() - - -class Seq2SeqGloVeSummarizerV2(object): - - model_name = 'seq2seq-glove-v2' - - def __init__(self, config): - self.max_input_seq_length = config['max_input_seq_length'] - self.num_target_tokens = config['num_target_tokens'] - self.max_target_seq_length = config['max_target_seq_length'] - self.target_word2idx = config['target_word2idx'] - self.target_idx2word = config['target_idx2word'] - self.version = 0 - if 'version' in config: - self.version = config['version'] - - self.word2em = dict() - if 'unknown_emb' in config: - self.unknown_emb = config['unknown_emb'] - else: - self.unknown_emb = np.random.rand(1, GLOVE_EMBEDDING_SIZE) - config['unknown_emb'] = self.unknown_emb - - self.config = config - - encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') - encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') - encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) - encoder_states = [encoder_state_h, encoder_state_c] - - decoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='decoder_inputs') - decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') - decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, - initial_state=encoder_states) - decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') - decoder_outputs = decoder_dense(decoder_outputs) - - model = Model([encoder_inputs, decoder_inputs], decoder_outputs) - - model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) - - self.model = model - - self.encoder_model = Model(encoder_inputs, encoder_states) - - decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] - decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) - decoder_states = [state_h, state_c] - decoder_outputs = decoder_dense(decoder_outputs) - self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) - - def load_weights(self, weight_file_path): - if os.path.exists(weight_file_path): - self.model.load_weights(weight_file_path) - - def load_glove(self, data_dir_path): - self.word2em = load_glove(data_dir_path) - - def transform_input_text(self, texts): - temp = [] - for line in texts: - x = np.zeros(shape=(self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) - for idx, word in enumerate(line.lower().split(' ')): - if idx >= self.max_input_seq_length: - break - emb = self.unknown_emb - if word in self.word2em: - emb = self.word2em[word] - x[idx, :] = emb - temp.append(x) - temp = pad_sequences(temp, maxlen=self.max_input_seq_length) - - print(temp.shape) - return temp - - def transform_target_encoding(self, texts): - temp = [] - for line in texts: - x = [] - line2 = 'start ' + line.lower() + ' end' - for word in line2.split(' '): - x.append(word) - if len(x) >= self.max_target_seq_length: - break - temp.append(x) - - temp = np.array(temp) - print(temp.shape) - return temp - - def generate_batch(self, x_samples, y_samples, 
batch_size): - num_batches = len(x_samples) // batch_size - while True: - for batchIdx in range(0, num_batches): - start = batchIdx * batch_size - end = (batchIdx + 1) * batch_size - encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) - decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) - decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, GLOVE_EMBEDDING_SIZE)) - for lineIdx, target_words in enumerate(y_samples[start:end]): - for idx, w in enumerate(target_words): - w2idx = 0 # default [UNK] - if w in self.word2em: - emb = self.unknown_emb - decoder_input_data_batch[lineIdx, idx, :] = emb - if w in self.target_word2idx: - w2idx = self.target_word2idx[w] - if w2idx != 0: - if idx > 0: - decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1 - yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch - - @staticmethod - def get_weight_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-weights.h5' - - @staticmethod - def get_config_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-config.npy' - - @staticmethod - def get_architecture_file_path(model_dir_path): - return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-architecture.json' - - def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None): - if epochs is None: - epochs = DEFAULT_EPOCHS - if model_dir_path is None: - model_dir_path = './models' - if batch_size is None: - batch_size = DEFAULT_BATCH_SIZE - - self.version += 1 - self.config['version'] = self.version - config_file_path = Seq2SeqGloVeSummarizerV2.get_config_file_path(model_dir_path) - weight_file_path = Seq2SeqGloVeSummarizerV2.get_weight_file_path(model_dir_path) - checkpoint = ModelCheckpoint(weight_file_path) - np.save(config_file_path, self.config) - architecture_file_path = Seq2SeqGloVeSummarizerV2.get_architecture_file_path(model_dir_path) - open(architecture_file_path, 'w').write(self.model.to_json()) - - Ytrain = self.transform_target_encoding(Ytrain) - Ytest = self.transform_target_encoding(Ytest) - - Xtrain = self.transform_input_text(Xtrain) - Xtest = self.transform_input_text(Xtest) - - train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) - test_gen = self.generate_batch(Xtest, Ytest, batch_size) - - train_num_batches = len(Xtrain) // batch_size - test_num_batches = len(Xtest) // batch_size - - history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, - epochs=epochs, - verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, - callbacks=[checkpoint]) - self.model.save_weights(weight_file_path) - return history - - def summarize(self, input_text): - input_seq = np.zeros(shape=(1, self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) - for idx, word in enumerate(input_text.lower().split(' ')): - if idx >= self.max_input_seq_length: - break - emb = self.unknown_emb # default [UNK] - if word in self.word2em: - emb = self.word2em[word] - input_seq[0, idx, :] = emb - states_value = self.encoder_model.predict(input_seq) - target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE)) - target_seq[0, 0, :] = self.word2em['start'] - target_text = '' - target_text_len = 0 - terminated = False - while not terminated: - output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) - - sample_token_idx = 
np.argmax(output_tokens[0, -1, :]) - sample_word = self.target_idx2word[sample_token_idx] - target_text_len += 1 - - if sample_word != 'start' and sample_word != 'end': - target_text += ' ' + sample_word - - if sample_word == 'end' or target_text_len >= self.max_target_seq_length: - terminated = True - - if sample_word in self.word2em: - target_seq[0, 0, :] = self.word2em[sample_word] - else: - target_seq[0, 0, :] = self.unknown_emb - - states_value = [h, c] - return target_text.strip() - - - +from __future__ import print_function + +from keras.models import Model +from keras.layers import Embedding, Dense, Input +from keras.layers.recurrent import LSTM +from keras.preprocessing.sequence import pad_sequences +from keras.callbacks import ModelCheckpoint +from keras_text_summarization.library.utility.glove_loader import load_glove, GLOVE_EMBEDDING_SIZE +import numpy as np +import os + +HIDDEN_UNITS = 100 +DEFAULT_BATCH_SIZE = 64 +VERBOSE = 1 +DEFAULT_EPOCHS = 10 + + +class Seq2SeqSummarizer(object): + + model_name = 'seq2seq' + + def __init__(self, config): + self.num_input_tokens = config['num_input_tokens'] + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.input_word2idx = config['input_word2idx'] + self.input_idx2word = config['input_idx2word'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.config = config + + self.version = 0 + if 'version' in config: + self.version = config['version'] + + encoder_inputs = Input(shape=(None,), name='encoder_inputs') + encoder_embedding = Embedding(input_dim=self.num_input_tokens, output_dim=HIDDEN_UNITS, + input_length=self.max_input_seq_length, name='encoder_embedding') + encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') + encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) + encoder_states = [encoder_state_h, encoder_state_c] + + decoder_inputs = Input(shape=(None, self.num_target_tokens), name='decoder_inputs') + decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') + decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, + initial_state=encoder_states) + decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') + decoder_outputs = decoder_dense(decoder_outputs) + + model = Model([encoder_inputs, decoder_inputs], decoder_outputs) + + model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) + + self.model = model + + self.encoder_model = Model(encoder_inputs, encoder_states) + + decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] + decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) + decoder_states = [state_h, state_c] + decoder_outputs = decoder_dense(decoder_outputs) + self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + self.model.load_weights(weight_file_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = [] + for word in line.lower().split(' '): + wid = 1 + if word in self.input_word2idx: + wid = self.input_word2idx[word] + x.append(wid) + if len(x) >= self.max_input_seq_length: + 
break
+            temp.append(x)
+        temp = pad_sequences(temp, maxlen=self.max_input_seq_length)
+
+        print(temp.shape)
+        return temp
+
+    def transform_target_encoding(self, texts):
+        temp = []
+        for line in texts:
+            x = []
+            line2 = 'START ' + line.lower() + ' END'
+            for word in line2.split(' '):
+                x.append(word)
+                if len(x) >= self.max_target_seq_length:
+                    break
+            temp.append(x)
+
+        temp = np.array(temp)
+        print(temp.shape)
+        return temp
+
+    def generate_batch(self, x_samples, y_samples, batch_size):
+        num_batches = len(x_samples) // batch_size
+        while True:
+            for batchIdx in range(0, num_batches):
+                start = batchIdx * batch_size
+                end = (batchIdx + 1) * batch_size
+                encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length)
+                decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens))
+                decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens))
+                for lineIdx, target_words in enumerate(y_samples[start:end]):
+                    for idx, w in enumerate(target_words):
+                        w2idx = 0  # default [UNK]
+                        if w in self.target_word2idx:
+                            w2idx = self.target_word2idx[w]
+                        if w2idx != 0:
+                            decoder_input_data_batch[lineIdx, idx, w2idx] = 1
+                            if idx > 0:
+                                decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
+                yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch
+
+    @staticmethod
+    def get_weight_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-weights.h5'
+
+    @staticmethod
+    def get_config_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-config.npy'
+
+    @staticmethod
+    def get_architecture_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqSummarizer.model_name + '-architecture.json'
+
+    def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None):
+        if epochs is None:
+            epochs = DEFAULT_EPOCHS
+        if model_dir_path is None:
+            # keep the './demo/models' default introduced earlier in this series,
+            # so the demo still resolves when run from the repository root
+            model_dir_path = './demo/models'
+        if batch_size is None:
+            batch_size = DEFAULT_BATCH_SIZE
+
+        self.version += 1
+        self.config['version'] = self.version
+        config_file_path = Seq2SeqSummarizer.get_config_file_path(model_dir_path)
+        weight_file_path = Seq2SeqSummarizer.get_weight_file_path(model_dir_path)
+        checkpoint = ModelCheckpoint(weight_file_path)
+        np.save(config_file_path, self.config)
+        architecture_file_path = Seq2SeqSummarizer.get_architecture_file_path(model_dir_path)
+        open(architecture_file_path, 'w').write(self.model.to_json())
+
+        Ytrain = self.transform_target_encoding(Ytrain)
+        Ytest = self.transform_target_encoding(Ytest)
+
+        Xtrain = self.transform_input_text(Xtrain)
+        Xtest = self.transform_input_text(Xtest)
+
+        train_gen = self.generate_batch(Xtrain, Ytrain, batch_size)
+        test_gen = self.generate_batch(Xtest, Ytest, batch_size)
+
+        train_num_batches = len(Xtrain) // batch_size
+        test_num_batches = len(Xtest) // batch_size
+
+        history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
+                                           epochs=epochs,
+                                           verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches,
+                                           callbacks=[checkpoint])
+        self.model.save_weights(weight_file_path)
+        return history
+
+    def summarize(self, input_text):
+        input_seq = []
+        input_wids = []
+        for word in input_text.lower().split(' '):
+            idx = 1  # default [UNK]
+            if word in self.input_word2idx:
+                idx = self.input_word2idx[word]
+            input_wids.append(idx)
+        input_seq.append(input_wids)
+        input_seq = pad_sequences(input_seq, 
self.max_input_seq_length) + states_value = self.encoder_model.predict(input_seq) + target_seq = np.zeros((1, 1, self.num_target_tokens)) + target_seq[0, 0, self.target_word2idx['START']] = 1 + target_text = '' + target_text_len = 0 + terminated = False + while not terminated: + output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) + + sample_token_idx = np.argmax(output_tokens[0, -1, :]) + sample_word = self.target_idx2word[sample_token_idx] + target_text_len += 1 + + if sample_word != 'START' and sample_word != 'END': + target_text += ' ' + sample_word + + if sample_word == 'END' or target_text_len >= self.max_target_seq_length: + terminated = True + + target_seq = np.zeros((1, 1, self.num_target_tokens)) + target_seq[0, 0, sample_token_idx] = 1 + + states_value = [h, c] + return target_text.strip() + + +class Seq2SeqGloVeSummarizer(object): + + model_name = 'seq2seq-glove' + + def __init__(self, config): + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.version = 0 + if 'version' in config: + self.version = config['version'] + + self.word2em = dict() + if 'unknown_emb' in config: + self.unknown_emb = config['unknown_emb'] + else: + self.unknown_emb = np.random.rand(1, GLOVE_EMBEDDING_SIZE) + config['unknown_emb'] = self.unknown_emb + + self.config = config + + encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') + encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') + encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) + encoder_states = [encoder_state_h, encoder_state_c] + + decoder_inputs = Input(shape=(None, self.num_target_tokens), name='decoder_inputs') + decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') + decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, + initial_state=encoder_states) + decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') + decoder_outputs = decoder_dense(decoder_outputs) + + model = Model([encoder_inputs, decoder_inputs], decoder_outputs) + + model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) + + self.model = model + + self.encoder_model = Model(encoder_inputs, encoder_states) + + decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] + decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) + decoder_states = [state_h, state_c] + decoder_outputs = decoder_dense(decoder_outputs) + self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + self.model.load_weights(weight_file_path) + + def load_glove(self, data_dir_path): + self.word2em = load_glove(data_dir_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = np.zeros(shape=(self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) + for idx, word in enumerate(line.lower().split(' ')): + if idx >= self.max_input_seq_length: + break + emb = self.unknown_emb + if word in self.word2em: + emb = self.word2em[word] + x[idx, :] = emb + temp.append(x) + temp = pad_sequences(temp, 
maxlen=self.max_input_seq_length) + + print(temp.shape) + return temp + + def transform_target_encoding(self, texts): + temp = [] + for line in texts: + x = [] + line2 = 'START ' + line.lower() + ' END' + for word in line2.split(' '): + x.append(word) + if len(x) >= self.max_target_seq_length: + break + temp.append(x) + + temp = np.array(temp) + print(temp.shape) + return temp + + def generate_batch(self, x_samples, y_samples, batch_size): + num_batches = len(x_samples) // batch_size + while True: + for batchIdx in range(0, num_batches): + start = batchIdx * batch_size + end = (batchIdx + 1) * batch_size + encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length) + decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) + decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens)) + for lineIdx, target_words in enumerate(y_samples[start:end]): + for idx, w in enumerate(target_words): + w2idx = 0 # default [UNK] + if w in self.target_word2idx: + w2idx = self.target_word2idx[w] + if w2idx != 0: + decoder_input_data_batch[lineIdx, idx, w2idx] = 1 + if idx > 0: + decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1 + yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch + + @staticmethod + def get_weight_file_path(model_dir_path): + return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-weights.h5' + + @staticmethod + def get_config_file_path(model_dir_path): + return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-config.npy' + + @staticmethod + def get_architecture_file_path(model_dir_path): + return model_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-architecture.json' + + def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None): + if epochs is None: + epochs = DEFAULT_EPOCHS + if model_dir_path is None: + model_dir_path = './models' + if batch_size is None: + batch_size = DEFAULT_BATCH_SIZE + + self.version += 1 + self.config['version'] = self.version + config_file_path = Seq2SeqGloVeSummarizer.get_config_file_path(model_dir_path) + weight_file_path = Seq2SeqGloVeSummarizer.get_weight_file_path(model_dir_path) + checkpoint = ModelCheckpoint(weight_file_path) + np.save(config_file_path, self.config) + architecture_file_path = Seq2SeqGloVeSummarizer.get_architecture_file_path(model_dir_path) + open(architecture_file_path, 'w').write(self.model.to_json()) + + Ytrain = self.transform_target_encoding(Ytrain) + Ytest = self.transform_target_encoding(Ytest) + + Xtrain = self.transform_input_text(Xtrain) + Xtest = self.transform_input_text(Xtest) + + train_gen = self.generate_batch(Xtrain, Ytrain, batch_size) + test_gen = self.generate_batch(Xtest, Ytest, batch_size) + + train_num_batches = len(Xtrain) // batch_size + test_num_batches = len(Xtest) // batch_size + + history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches, + epochs=epochs, + verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches, + callbacks=[checkpoint]) + self.model.save_weights(weight_file_path) + return history + + def summarize(self, input_text): + input_seq = np.zeros(shape=(1, self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) + for idx, word in enumerate(input_text.lower().split(' ')): + if idx >= self.max_input_seq_length: + break + emb = self.unknown_emb # default [UNK] + if word in self.word2em: + emb = self.word2em[word] + 
input_seq[0, idx, :] = emb + states_value = self.encoder_model.predict(input_seq) + target_seq = np.zeros((1, 1, self.num_target_tokens)) + target_seq[0, 0, self.target_word2idx['START']] = 1 + target_text = '' + target_text_len = 0 + terminated = False + while not terminated: + output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) + + sample_token_idx = np.argmax(output_tokens[0, -1, :]) + sample_word = self.target_idx2word[sample_token_idx] + target_text_len += 1 + + if sample_word != 'START' and sample_word != 'END': + target_text += ' ' + sample_word + + if sample_word == 'END' or target_text_len >= self.max_target_seq_length: + terminated = True + + target_seq = np.zeros((1, 1, self.num_target_tokens)) + target_seq[0, 0, sample_token_idx] = 1 + + states_value = [h, c] + return target_text.strip() + + +class Seq2SeqGloVeSummarizerV2(object): + + model_name = 'seq2seq-glove-v2' + + def __init__(self, config): + self.max_input_seq_length = config['max_input_seq_length'] + self.num_target_tokens = config['num_target_tokens'] + self.max_target_seq_length = config['max_target_seq_length'] + self.target_word2idx = config['target_word2idx'] + self.target_idx2word = config['target_idx2word'] + self.version = 0 + if 'version' in config: + self.version = config['version'] + + self.word2em = dict() + if 'unknown_emb' in config: + self.unknown_emb = config['unknown_emb'] + else: + self.unknown_emb = np.random.rand(1, GLOVE_EMBEDDING_SIZE) + config['unknown_emb'] = self.unknown_emb + + self.config = config + + encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') + encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') + encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) + encoder_states = [encoder_state_h, encoder_state_c] + + decoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='decoder_inputs') + decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') + decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, + initial_state=encoder_states) + decoder_dense = Dense(units=self.num_target_tokens, activation='softmax', name='decoder_dense') + decoder_outputs = decoder_dense(decoder_outputs) + + model = Model([encoder_inputs, decoder_inputs], decoder_outputs) + + model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) + + self.model = model + + self.encoder_model = Model(encoder_inputs, encoder_states) + + decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] + decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) + decoder_states = [state_h, state_c] + decoder_outputs = decoder_dense(decoder_outputs) + self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) + + def load_weights(self, weight_file_path): + if os.path.exists(weight_file_path): + self.model.load_weights(weight_file_path) + + def load_glove(self, data_dir_path): + self.word2em = load_glove(data_dir_path) + + def transform_input_text(self, texts): + temp = [] + for line in texts: + x = np.zeros(shape=(self.max_input_seq_length, GLOVE_EMBEDDING_SIZE)) + for idx, word in enumerate(line.lower().split(' ')): + if idx >= self.max_input_seq_length: + break + emb = self.unknown_emb + if word in self.word2em: + emb = self.word2em[word] + x[idx, :] = emb + temp.append(x) + temp = pad_sequences(temp, 
maxlen=self.max_input_seq_length)
+
+        print(temp.shape)
+        return temp
+
+    def transform_target_encoding(self, texts):
+        temp = []
+        for line in texts:
+            x = []
+            line2 = 'start ' + line.lower() + ' end'
+            for word in line2.split(' '):
+                x.append(word)
+                if len(x) >= self.max_target_seq_length:
+                    break
+            temp.append(x)
+
+        temp = np.array(temp)
+        print(temp.shape)
+        return temp
+
+    def generate_batch(self, x_samples, y_samples, batch_size):
+        num_batches = len(x_samples) // batch_size
+        while True:
+            for batchIdx in range(0, num_batches):
+                start = batchIdx * batch_size
+                end = (batchIdx + 1) * batch_size
+                encoder_input_data_batch = pad_sequences(x_samples[start:end], self.max_input_seq_length)
+                decoder_target_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, self.num_target_tokens))
+                decoder_input_data_batch = np.zeros(shape=(batch_size, self.max_target_seq_length, GLOVE_EMBEDDING_SIZE))
+                for lineIdx, target_words in enumerate(y_samples[start:end]):
+                    for idx, w in enumerate(target_words):
+                        w2idx = 0  # default [UNK]
+                        # use the word's GloVe vector when available, otherwise
+                        # fall back to the unknown-word embedding
+                        emb = self.unknown_emb
+                        if w in self.word2em:
+                            emb = self.word2em[w]
+                        decoder_input_data_batch[lineIdx, idx, :] = emb
+                        if w in self.target_word2idx:
+                            w2idx = self.target_word2idx[w]
+                        if w2idx != 0:
+                            if idx > 0:
+                                decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
+                yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch
+
+    @staticmethod
+    def get_weight_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-weights.h5'
+
+    @staticmethod
+    def get_config_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-config.npy'
+
+    @staticmethod
+    def get_architecture_file_path(model_dir_path):
+        return model_dir_path + '/' + Seq2SeqGloVeSummarizerV2.model_name + '-architecture.json'
+
+    def fit(self, Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None):
+        if epochs is None:
+            epochs = DEFAULT_EPOCHS
+        if model_dir_path is None:
+            model_dir_path = './models'
+        if batch_size is None:
+            batch_size = DEFAULT_BATCH_SIZE
+
+        self.version += 1
+        self.config['version'] = self.version
+        config_file_path = Seq2SeqGloVeSummarizerV2.get_config_file_path(model_dir_path)
+        weight_file_path = Seq2SeqGloVeSummarizerV2.get_weight_file_path(model_dir_path)
+        checkpoint = ModelCheckpoint(weight_file_path)
+        np.save(config_file_path, self.config)
+        architecture_file_path = Seq2SeqGloVeSummarizerV2.get_architecture_file_path(model_dir_path)
+        open(architecture_file_path, 'w').write(self.model.to_json())
+
+        Ytrain = self.transform_target_encoding(Ytrain)
+        Ytest = self.transform_target_encoding(Ytest)
+
+        Xtrain = self.transform_input_text(Xtrain)
+        Xtest = self.transform_input_text(Xtest)
+
+        train_gen = self.generate_batch(Xtrain, Ytrain, batch_size)
+        test_gen = self.generate_batch(Xtest, Ytest, batch_size)
+
+        train_num_batches = len(Xtrain) // batch_size
+        test_num_batches = len(Xtest) // batch_size
+
+        history = self.model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
+                                           epochs=epochs,
+                                           verbose=VERBOSE, validation_data=test_gen, validation_steps=test_num_batches,
+                                           callbacks=[checkpoint])
+        self.model.save_weights(weight_file_path)
+        return history
+
+    def summarize(self, input_text):
+        input_seq = np.zeros(shape=(1, self.max_input_seq_length, GLOVE_EMBEDDING_SIZE))
+        for idx, word in enumerate(input_text.lower().split(' ')):
+            if idx >= self.max_input_seq_length:
+                break
+            emb = self.unknown_emb  # default [UNK]
+            if 
word in self.word2em: + emb = self.word2em[word] + input_seq[0, idx, :] = emb + states_value = self.encoder_model.predict(input_seq) + target_seq = np.zeros((1, 1, GLOVE_EMBEDDING_SIZE)) + target_seq[0, 0, :] = self.word2em['start'] + target_text = '' + target_text_len = 0 + terminated = False + while not terminated: + output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) + + sample_token_idx = np.argmax(output_tokens[0, -1, :]) + sample_word = self.target_idx2word[sample_token_idx] + target_text_len += 1 + + if sample_word != 'start' and sample_word != 'end': + target_text += ' ' + sample_word + + if sample_word == 'end' or target_text_len >= self.max_target_seq_length: + terminated = True + + if sample_word in self.word2em: + target_seq[0, 0, :] = self.word2em[sample_word] + else: + target_seq[0, 0, :] = self.unknown_emb + + states_value = [h, c] + return target_text.strip() + + + diff --git a/keras_text_summarization/library/utility/device_utils.py b/keras_text_summarization/library/utility/device_utils.py index cf79a19..536274a 100644 --- a/keras_text_summarization/library/utility/device_utils.py +++ b/keras_text_summarization/library/utility/device_utils.py @@ -1,22 +1,22 @@ -import tensorflow as tf -from keras import backend as K - - -def init_devices(device_type=None): - if device_type is None: - device_type = 'cpu' - - num_cores = 4 - - if device_type == 'gpu': - num_GPU = 1 - num_CPU = 1 - else: - num_CPU = 1 - num_GPU = 0 - - config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, - inter_op_parallelism_threads=num_cores, allow_soft_placement=True, - device_count={'CPU': num_CPU, 'GPU': num_GPU}) - session = tf.Session(config=config) - K.set_session(session) +import tensorflow as tf +from keras import backend as K + + +def init_devices(device_type=None): + if device_type is None: + device_type = 'cpu' + + num_cores = 4 + + if device_type == 'gpu': + num_GPU = 1 + num_CPU = 1 + else: + num_CPU = 1 + num_GPU = 0 + + config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, + inter_op_parallelism_threads=num_cores, allow_soft_placement=True, + device_count={'CPU': num_CPU, 'GPU': num_GPU}) + session = tf.Session(config=config) + K.set_session(session) diff --git a/keras_text_summarization/library/utility/glove_loader.py b/keras_text_summarization/library/utility/glove_loader.py index df81e39..7763ad3 100644 --- a/keras_text_summarization/library/utility/glove_loader.py +++ b/keras_text_summarization/library/utility/glove_loader.py @@ -1,79 +1,78 @@ -try: - import urllib.request -except: - import urllib -import os -import sys -import zipfile -import numpy as np - -GLOVE_EMBEDDING_SIZE = 100 - - -def reporthook(block_num, block_size, total_size): - read_so_far = block_num * block_size - if total_size > 0: - percent = read_so_far * 1e2 / total_size - s = "\r%5.1f%% %*d / %d" % ( - percent, len(str(total_size)), read_so_far, total_size) - sys.stderr.write(s) - if read_so_far >= total_size: # near the end - sys.stderr.write("\n") - else: # total size is unknown - sys.stderr.write("read %d\n" % (read_so_far,)) - - -def download_glove(data_dir_path=None): - if data_dir_path is None: - data_dir_path = 'very_large_data' - glove_model_path = data_dir_path + "/glove.6B." 
+ str(GLOVE_EMBEDDING_SIZE) + "d.txt"
-    if not os.path.exists(glove_model_path):
-
-        glove_zip = data_dir_path + '/glove.6B.zip'
-
-        if not os.path.exists(data_dir_path):
-            os.makedirs(data_dir_path)
-
-        if not os.path.exists(glove_zip):
-            print('glove file does not exist, downloading from internet')
-            if sys.version[0]=="3":
-                urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip,
-                                   reporthook=reporthook)
-            elif sys.version[0]=="2":
-                urllib.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename="glove_zip", reporthook=reporthook)
-
-
-        print('unzipping glove file')
-        zip_ref = zipfile.ZipFile(glove_zip, 'r')
-        zip_ref.extractall(data_dir_path)
-        zip_ref.close()
-
-
-def load_glove(data_dir_path=None):
-    if data_dir_path is None:
-        data_dir_path = 'very_large_data'
-    download_glove(data_dir_path)
-    _word2em = {}
-    glove_model_path = data_dir_path + "/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt"
-    file = open(glove_model_path, mode='rt', encoding='utf8')
-    for line in file:
-        words = line.strip().split()
-        word = words[0]
-        embeds = np.array(words[1:], dtype=np.float32)
-        _word2em[word] = embeds
-    file.close()
-    return _word2em
-
-
-def glove_zero_emb():
-    return np.zeros(shape=GLOVE_EMBEDDING_SIZE)
-
-
-class Glove(object):
-
-    word2em = None
-
-    GLOVE_EMBEDDING_SIZE = GLOVE_EMBEDDING_SIZE
-
-    def __init__(self):
-        self.word2em = load_glove()
+try:
+    # Python 3
+    import urllib.request
+except ImportError:
+    # Python 2.7
+    import urllib
+import io
+import os
+import sys
+import zipfile
+import numpy as np
+
+GLOVE_EMBEDDING_SIZE = 100
+
+
+def reporthook(block_num, block_size, total_size):
+    read_so_far = block_num * block_size
+    if total_size > 0:
+        percent = read_so_far * 1e2 / total_size
+        s = "\r%5.1f%% %*d / %d" % (
+            percent, len(str(total_size)), read_so_far, total_size)
+        sys.stderr.write(s)
+        if read_so_far >= total_size:  # near the end
+            sys.stderr.write("\n")
+    else:  # total size is unknown
+        sys.stderr.write("read %d\n" % (read_so_far,))
+
+
+def download_glove(data_dir_path=None):
+    if data_dir_path is None:
+        data_dir_path = 'very_large_data'
+    glove_model_path = data_dir_path + "/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt"
+    if not os.path.exists(glove_model_path):
+
+        glove_zip = data_dir_path + '/glove.6B.zip'
+
+        if not os.path.exists(data_dir_path):
+            os.makedirs(data_dir_path)
+
+        if not os.path.exists(glove_zip):
+            print('glove file does not exist, downloading from internet')
+            # note: filename must be the glove_zip variable, not the string
+            # "glove_zip", or the download lands in the wrong file and the
+            # unzip step below fails
+            if sys.version_info[0] == 3:
+                urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip,
+                                           reporthook=reporthook)
+            else:
+                urllib.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip,
+                                   reporthook=reporthook)
+
+        print('unzipping glove file')
+        zip_ref = zipfile.ZipFile(glove_zip, 'r')
+        zip_ref.extractall(data_dir_path)
+        zip_ref.close()
+
+
+def load_glove(data_dir_path=None):
+    if data_dir_path is None:
+        data_dir_path = 'very_large_data'
+    download_glove(data_dir_path)
+    _word2em = {}
+    glove_model_path = data_dir_path + "/glove.6B." 
+ str(GLOVE_EMBEDDING_SIZE) + "d.txt"
+    # io.open accepts the encoding argument on both Python 2.7 and 3,
+    # unlike the Python 2 builtin open
+    file = io.open(glove_model_path, mode='rt', encoding='utf8')
+    for line in file:
+        words = line.strip().split()
+        word = words[0]
+        embeds = np.array(words[1:], dtype=np.float32)
+        _word2em[word] = embeds
+    file.close()
+    return _word2em
+
+
+def glove_zero_emb():
+    return np.zeros(shape=GLOVE_EMBEDDING_SIZE)
+
+
+class Glove(object):
+
+    word2em = None
+
+    GLOVE_EMBEDDING_SIZE = GLOVE_EMBEDDING_SIZE
+
+    def __init__(self):
+        self.word2em = load_glove()
diff --git a/keras_text_summarization/library/utility/plot_utils.py b/keras_text_summarization/library/utility/plot_utils.py
index 25046bf..ff6d569 100644
--- a/keras_text_summarization/library/utility/plot_utils.py
+++ b/keras_text_summarization/library/utility/plot_utils.py
@@ -1,106 +1,106 @@
-from matplotlib import pyplot as plt
-import numpy as np
-import itertools
-
-
-def plot_confusion_matrix(cm, classes,
-                          normalize=False,
-                          title='Confusion matrix',
-                          cmap=plt.cm.Blues):
-    """
-    See full source and example:
-    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
-
-    This function prints and plots the confusion matrix.
-    Normalization can be applied by setting `normalize=True`.
-    """
-    plt.imshow(cm, interpolation='nearest', cmap=cmap)
-    plt.title(title)
-    plt.colorbar()
-    tick_marks = np.arange(len(classes))
-    plt.xticks(tick_marks, classes, rotation=45)
-    plt.yticks(tick_marks, classes)
-
-    if normalize:
-        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
-        print("Normalized confusion matrix")
-    else:
-        print('Confusion matrix, without normalization')
-
-    thresh = cm.max() / 2.
-    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
-        plt.text(j, i, cm[i, j],
-                 horizontalalignment="center",
-                 color="white" if cm[i, j] > thresh else "black")
-
-    plt.tight_layout()
-    plt.ylabel('True label')
-    plt.xlabel('Predicted label')
-    plt.show()
-
-
-def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
-    """
-    See: https://stackoverflow.com/a/26980472
-
-    Identify most important features if given a vectorizer and binary classifier. Set n to the number
-    of weighted features you would like to show. (Note: current implementation merely prints and does not
-    return top classes.)
- """ - - class_labels = classifier.classes_ - feature_names = vectorizer.get_feature_names() - topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n] - topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:] - - for coef, feat in topn_class1: - print(class_labels[0], coef, feat) - - print() - - for coef, feat in reversed(topn_class2): - print(class_labels[1], coef, feat) - - -def plot_history_2win(history): - plt.subplot(211) - plt.title('Accuracy') - plt.plot(history.history['acc'], color='g', label='Train') - plt.plot(history.history['val_acc'], color='b', label='Validation') - plt.legend(loc='best') - - plt.subplot(212) - plt.title('Loss') - plt.plot(history.history['loss'], color='g', label='Train') - plt.plot(history.history['val_loss'], color='b', label='Validation') - plt.legend(loc='best') - - plt.tight_layout() - plt.show() - - -def create_history_plot(history, model_name, metrics=None): - plt.title('Accuracy and Loss (' + model_name + ')') - if metrics is None: - metrics = {'acc', 'loss'} - if 'acc' in metrics: - plt.plot(history.history['acc'], color='g', label='Train Accuracy') - plt.plot(history.history['val_acc'], color='b', label='Validation Accuracy') - if 'loss' in metrics: - plt.plot(history.history['loss'], color='r', label='Train Loss') - plt.plot(history.history['val_loss'], color='m', label='Validation Loss') - plt.legend(loc='best') - - plt.tight_layout() - - -def plot_history(history, model_name): - create_history_plot(history, model_name) - plt.show() - - -def plot_and_save_history(history, model_name, file_path, metrics=None): - if metrics is None: - metrics = {'acc', 'loss'} - create_history_plot(history, model_name, metrics) - plt.savefig(file_path) +from matplotlib import pyplot as plt +import numpy as np +import itertools + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + """ + See full source and example: + http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html + + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. + """ + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + thresh = cm.max() / 2. + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, cm[i, j], + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + plt.show() + + +def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100): + """ + See: https://stackoverflow.com/a/26980472 + + Identify most important features if given a vectorizer and binary classifier. Set n to the number + of weighted features you would like to show. (Note: current implementation merely prints and does not + return top classes.) 
+    """
+
+    class_labels = classifier.classes_
+    feature_names = vectorizer.get_feature_names()
+    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
+    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
+
+    for coef, feat in topn_class1:
+        print(class_labels[0], coef, feat)
+
+    print()
+
+    for coef, feat in reversed(topn_class2):
+        print(class_labels[1], coef, feat)
+
+
+def plot_history_2win(history):
+    plt.subplot(211)
+    plt.title('Accuracy')
+    plt.plot(history.history['acc'], color='g', label='Train')
+    plt.plot(history.history['val_acc'], color='b', label='Validation')
+    plt.legend(loc='best')
+
+    plt.subplot(212)
+    plt.title('Loss')
+    plt.plot(history.history['loss'], color='g', label='Train')
+    plt.plot(history.history['val_loss'], color='b', label='Validation')
+    plt.legend(loc='best')
+
+    plt.tight_layout()
+    plt.show()
+
+
+def create_history_plot(history, model_name, metrics=None):
+    plt.title('Accuracy and Loss (' + model_name + ')')
+    if metrics is None:
+        metrics = {'acc', 'loss'}
+    if 'acc' in metrics:
+        plt.plot(history.history['acc'], color='g', label='Train Accuracy')
+        plt.plot(history.history['val_acc'], color='b', label='Validation Accuracy')
+    if 'loss' in metrics:
+        plt.plot(history.history['loss'], color='r', label='Train Loss')
+        plt.plot(history.history['val_loss'], color='m', label='Validation Loss')
+    plt.legend(loc='best')
+
+    plt.tight_layout()
+
+
+def plot_history(history, model_name):
+    create_history_plot(history, model_name)
+    plt.show()
+
+
+def plot_and_save_history(history, model_name, file_path, metrics=None):
+    if metrics is None:
+        metrics = {'acc', 'loss'}
+    create_history_plot(history, model_name, metrics)
+    plt.savefig(file_path)

From 62d3ee8a6cb61615d3b905543fe57d51dbb69f28 Mon Sep 17 00:00:00 2001
From: bikramkhastgir <7278304+bikramkhastgir@users.noreply.github.com>
Date: Mon, 24 Dec 2018 06:55:35 +0530
Subject: [PATCH 9/9] Re-upload text_utils.py with normalized line endings
 (content unchanged)

---
 .../library/utility/text_utils.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/keras_text_summarization/library/utility/text_utils.py b/keras_text_summarization/library/utility/text_utils.py
index 6fcec3e..ed3da57 100644
--- a/keras_text_summarization/library/utility/text_utils.py
+++ b/keras_text_summarization/library/utility/text_utils.py
@@ -1,9 +1,9 @@
-WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,'
-
-
-def in_white_list(_word):
-    for char in _word:
-        if char in WHITELIST:
-            return True
-
-    return False
+WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,'
+
+
+def in_white_list(_word):
+    for char in _word:
+        if char in WHITELIST:
+            return True
+
+    return False
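
[Editor's note] As a quick sanity check of the in_white_list() helper that
PATCH 9/9 re-uploads, the sketch below shows one plausible way to use it:
dropping tokens that contain no whitelisted character before building a
vocabulary. Run it from the repository root so the package import resolves.
The clean_tokens() wrapper and the sample sentence are illustrative
assumptions, not code from this repository.

    # sanity_check_text_utils.py -- hypothetical usage sketch, not part of the patches
    from keras_text_summarization.library.utility.text_utils import in_white_list

    def clean_tokens(text):
        # in_white_list(word) returns True as soon as the word contains at
        # least one whitelisted character (a-z, 0-9, '?', '.', ','), so this
        # keeps ordinary words and numbers while dropping pure punctuation
        # tokens such as '--' or '***'
        return [w for w in text.lower().split(' ') if in_white_list(w)]

    print(clean_tokens('Hello, world -- this is 100% *** fake news?'))
    # expected output: ['hello,', 'world', 'this', 'is', '100%', 'fake', 'news?']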