From 2280d1ba8b451b9b6cb2a7c209bb11d42df2f2de Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 16 May 2024 12:50:38 -0500
Subject: [PATCH 1/6] add youtube to histogram

---
 containerized_analytics/smile/histogram/CHANGELOG.md | 10 ++++++++++
 .../smile/histogram/rabbitmq_handler.py              |  9 +++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 containerized_analytics/smile/histogram/CHANGELOG.md

diff --git a/containerized_analytics/smile/histogram/CHANGELOG.md b/containerized_analytics/smile/histogram/CHANGELOG.md
new file mode 100644
index 00000000..cfbd8a8f
--- /dev/null
+++ b/containerized_analytics/smile/histogram/CHANGELOG.md
@@ -0,0 +1,10 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.1.2] - 2024-05-16
+
+### Changed
+- Add YouTube videos as source to the histogram [#118](https://github.com/ncsa/standalone-smm-analytics/issues/118)
diff --git a/containerized_analytics/smile/histogram/rabbitmq_handler.py b/containerized_analytics/smile/histogram/rabbitmq_handler.py
index 2d7c80df..f45a5723 100644
--- a/containerized_analytics/smile/histogram/rabbitmq_handler.py
+++ b/containerized_analytics/smile/histogram/rabbitmq_handler.py
@@ -110,6 +110,15 @@ def rabbitmq_handler(ch, method, properties, body):
                 interval = '1M'
             freq = count_freq(df, 'info.dateuploaded', interval, 's')
 
+        # youtube videos
+        elif 'snnipet.publishedAt' in df.columns:
+            # default at 1 month
+            if 'interval' in event:
+                interval = event['interval']
+            else:
+                interval = '1M'
+            freq = count_freq(df, 'snnipet.publishedAt', interval, 's')
+
         else:
             return {'url': 'null'}
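Both the Flickr branch and the new YouTube branch funnel into `count_freq` from `histogram.py`. Only a fragment of that helper appears in the next patch, so here is a minimal sketch of the pattern it implements — parse the timestamp column, index by it, and count rows per bucket. The `resample` call and return shape are assumptions for illustration, not the repo's exact code:

```python
import pandas as pd

def count_freq(df, time_col_name, time_freq, time_unit=None):
    # Parse the timestamp column; `unit` only applies to numeric epoch values.
    df[time_col_name] = pd.to_datetime(df[time_col_name], unit=time_unit)
    # Index by time so rows can be bucketed into fixed-width intervals.
    df.set_index(df[time_col_name], inplace=True)
    # Count rows per interval, e.g. '1M' buckets by calendar month.
    return df[time_col_name].resample(time_freq).count()
```

The YouTube branch above calls it as `count_freq(df, 'snnipet.publishedAt', interval, 's')`; both the misspelled column name and the superfluous `'s'` unit are corrected in the next patch.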
From e7acf92215d740f2c75369ac212164f541b6918f Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Wed, 22 May 2024 12:04:13 -0500
Subject: [PATCH 2/6] 118 add youtube videos as source to histogram (#122)

* typo

* no need to specify unit for ISO 8601 format
---
 containerized_analytics/smile/histogram/histogram.py        | 4 ++--
 containerized_analytics/smile/histogram/rabbitmq_handler.py | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/containerized_analytics/smile/histogram/histogram.py b/containerized_analytics/smile/histogram/histogram.py
index 9510f041..5c2a99a4 100644
--- a/containerized_analytics/smile/histogram/histogram.py
+++ b/containerized_analytics/smile/histogram/histogram.py
@@ -67,9 +67,9 @@ def plot_freq(index, counts, interval, localPath, remotePath):
     return div_url
 
 
-def count_freq(df, time_col_name, time_freq, time_unit):
+def count_freq(df, time_col_name, time_freq, time_unit=None):
     # convert time column to datetime
-    df[time_col_name] = pd.to_datetime(df[time_col_name],unit=time_unit)
+    df[time_col_name] = pd.to_datetime(df[time_col_name], unit=time_unit)
 
     # set index to datetime
     df.set_index(df[time_col_name],inplace=True)
diff --git a/containerized_analytics/smile/histogram/rabbitmq_handler.py b/containerized_analytics/smile/histogram/rabbitmq_handler.py
index f45a5723..02407e63 100644
--- a/containerized_analytics/smile/histogram/rabbitmq_handler.py
+++ b/containerized_analytics/smile/histogram/rabbitmq_handler.py
@@ -111,15 +111,16 @@ def rabbitmq_handler(ch, method, properties, body):
             freq = count_freq(df, 'info.dateuploaded', interval, 's')
 
         # youtube videos
-        elif 'snnipet.publishedAt' in df.columns:
+        elif 'snippet.publishedAt' in df.columns:
             # default at 1 month
             if 'interval' in event:
                 interval = event['interval']
             else:
                 interval = '1M'
-            freq = count_freq(df, 'snnipet.publishedAt', interval, 's')
+            freq = count_freq(df, 'snippet.publishedAt', interval)
 
         else:
+            print("No valid time column found")
             return {'url': 'null'}
 
         index = freq.index.tolist()
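The second commit bullet is the substantive fix: pandas' `unit=` keyword interprets numeric epoch offsets, whereas YouTube's `snippet.publishedAt` is an ISO 8601 string that `pd.to_datetime` parses on its own. A standalone illustration (timestamps are made up):

```python
import pandas as pd

# Epoch seconds, e.g. Flickr's info.dateuploaded, need an explicit unit:
pd.to_datetime(1715817600, unit='s')    # Timestamp('2024-05-16 00:00:00')

# ISO 8601 strings, e.g. YouTube's snippet.publishedAt, parse directly:
pd.to_datetime('2024-05-16T00:00:00Z')  # Timestamp('2024-05-16 00:00:00+0000')

# Combining the two raises, which is why the 's' argument was dropped:
# pd.to_datetime('2024-05-16T00:00:00Z', unit='s')  -> ValueError
```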
From 8dc91378d3325cc841211e608b1a830b720a15b8 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 8 Aug 2024 14:17:26 -0500
Subject: [PATCH 3/6] filter out non-english (#124)

* filter out non-english

* add language detection using spacy and pass in additional parameters
---
 .../smile/topic_modeling/CHANGELOG.md         |  4 ++++
 .../smile/topic_modeling/Dockerfile           |  1 +
 .../smile/topic_modeling/algorithm.py         | 13 ++++++++++-
 .../topic_modeling/gensim_topic_modeling.py   | 23 ++++++++++++++++++-
 .../smile/topic_modeling/requirement.txt      |  2 ++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/containerized_analytics/smile/topic_modeling/CHANGELOG.md b/containerized_analytics/smile/topic_modeling/CHANGELOG.md
index 7d167e72..89c98d41 100644
--- a/containerized_analytics/smile/topic_modeling/CHANGELOG.md
+++ b/containerized_analytics/smile/topic_modeling/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.6] - 07-16-2024
+
+### Changed
+- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123)
 
 ## [0.1.5] - 01-23-2024
 
diff --git a/containerized_analytics/smile/topic_modeling/Dockerfile b/containerized_analytics/smile/topic_modeling/Dockerfile
index 82fb3be7..477e3fdd 100644
--- a/containerized_analytics/smile/topic_modeling/Dockerfile
+++ b/containerized_analytics/smile/topic_modeling/Dockerfile
@@ -12,6 +12,7 @@ ENV RABBITMQ_HOST="rabbitmq"
 # install dependency libraries and download required data
 RUN pip install --no-cache-dir -r requirement.txt \
 && python3 -m nltk.downloader -d /usr/local/share/nltk_data stopwords wordnet \
+&& python3 -m spacy download en_core_web_sm \
 # cron job clean tmp folder
 && chmod u+x ./clear_cache.sh \
 && chmod 0644 ./clear_cache_cron \
diff --git a/containerized_analytics/smile/topic_modeling/algorithm.py b/containerized_analytics/smile/topic_modeling/algorithm.py
index 5b98ce85..a3c86643 100644
--- a/containerized_analytics/smile/topic_modeling/algorithm.py
+++ b/containerized_analytics/smile/topic_modeling/algorithm.py
@@ -12,7 +12,18 @@ def algorithm(df, params):
 
     output = {}
 
-    gensim_tm = Gensim_Topic_Modeling(df, column=params["column"])
+    # Check if english_only and language_score exist in params
+    english_only_param = params["english_only"] if "english_only" in params else True
+    language_score_param = params["language_score"] if "language_score" in params else 0.9
+
+    # Call the Gensim_Topic_Modeling function
+    gensim_tm = Gensim_Topic_Modeling(
+        df,
+        column=params["column"],
+        english_only=english_only_param,
+        language_score=language_score_param
+    )
+
     data_lemmatized, id2word, corpus = gensim_tm.preprocessing()
     output['lemmatized'] = data_lemmatized
 
diff --git a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py
index 356b573b..d96b0be3 100644
--- a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py
+++ b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py
@@ -7,15 +7,36 @@ from nltk import WordNetLemmatizer
 import pyLDAvis
 import pyLDAvis.gensim
+import spacy
+from spacy_langdetect import LanguageDetector
+from spacy.language import Language
 
 
 class Gensim_Topic_Modeling:
 
-    def __init__(self, df, column):
+    def __init__(self, df, column, english_only=True, language_score=0.9):
         self.data = df[df[column] != ''][column].dropna().astype(
             'str').tolist()
 
+        # Load a SpaCy model
+        self.nlp = spacy.load('en_core_web_sm')
+
+        # Add the language detector to the pipeline
+        @Language.factory("language_detector")
+        def get_lang_detector(nlp, name):
+            return LanguageDetector()
+
+        self.nlp.add_pipe('language_detector', last=True)
+        self.english_only = english_only
+        self.language_score = language_score
+
     def preprocessing(self):
+        # Detect and keep only English texts
+        if self.english_only:
+            self.data = [sent for sent in self.data if
+                         self.nlp(sent)._.language['language'] == 'en'
+                         and self.nlp(sent)._.language['score'] > self.language_score]
+
         self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data]
         self.data = [re.sub('\s+', ' ', sent) for sent in self.data]
         self.data = [re.sub("\'", "", sent) for sent in self.data]
diff --git a/containerized_analytics/smile/topic_modeling/requirement.txt b/containerized_analytics/smile/topic_modeling/requirement.txt
index ca7e73cf..f4962d9b 100644
--- a/containerized_analytics/smile/topic_modeling/requirement.txt
+++ b/containerized_analytics/smile/topic_modeling/requirement.txt
@@ -5,3 +5,5 @@ numpy>=1.18.1
 pandas>=1.1.4
 pyLDAvis==2.1.2
 pika>=1.1.0
+spacy==3.7.5
+spacy-langdetect==0.1.2
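For reference, the detector this patch wires into `Gensim_Topic_Modeling` can be exercised on its own. A small end-to-end sketch of the same pipeline (it requires the `en_core_web_sm` model the Dockerfile now downloads; the sample sentences are mine):

```python
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# Register the detector factory once, then append it to the pipeline,
# exactly as Gensim_Topic_Modeling.__init__ does above.
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector', last=True)

texts = [
    "The quick brown fox jumps over the lazy dog.",
    "El zorro marrón salta sobre el perro perezoso.",
]

english = []
for text in texts:
    lang = nlp(text)._.language  # e.g. {'language': 'en', 'score': 0.98}
    if lang['language'] == 'en' and lang['score'] > 0.9:
        english.append(text)

print(english)  # only the first sentence survives the filter
```

Note the default threshold of 0.9 means borderline short texts are dropped too; callers can relax it through the new `language_score` parameter.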
From 7a36a1b16873f13fcd0cfa96217093e63913603a Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 22 Aug 2024 15:52:36 -0500
Subject: [PATCH 4/6] 126 GitHub manual submission also takes in which analysis to run as parameter (#127)

* update GH action

* rewrite name?

* update the workflow

* typo
---
 .github/workflows/manual.yml | 154 +++++++----------------------
 1 file changed, 29 insertions(+), 125 deletions(-)

diff --git a/.github/workflows/manual.yml b/.github/workflows/manual.yml
index 0962885a..b2917f05 100644
--- a/.github/workflows/manual.yml
+++ b/.github/workflows/manual.yml
@@ -4,24 +4,19 @@ on:
   workflow_dispatch:
     inputs:
       push_to_where:
-        description: 'Type either "dockerhub" or "github" to push to the respective registry'
+        description: 'Select the registry to push to'
         default: 'github'
         required: true
-        type: string
-
-env:
-  MAIN_REPO: ncsa/standalone-smm-analytics
-
-jobs:
-  # ----------------------------------------------------------------------
-  # DOCKER BUILD
-  # ----------------------------------------------------------------------
-  docker:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: true
-      matrix:
-        name:
+        type: choice
+        options:
+          - github
+          - dockerhub
+      specific_package:
+        description: 'Select the specific package to build'
+        default: 'sentiment_analysis'
+        required: true
+        type: choice
+        options:
           - autophrase
           - histogram
           - check_screen_name
           - classification_predict
           - classification_train
           - classification_split
           - collect_reddit_comment
           - crimson_hexagon_monitors
           - image_crawler
           - name_entity_recognition
           - network_analysis
           - preprocessing
           - screen_name_prompt
           - sentiment_analysis
           - topic_modeling
           - clowder_create_collection
           - clowder_create_dataset
           - clowder_create_space
           - clowder_list
           - clowder_upload_file
-        include:
-          - name: autophrase
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/autophrase
-            docker_repo_tag: socialmediamacroscope/autophrase
-            github_repo_tag: ncsa/autophrase
-          - name: histogram
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/histogram
-            docker_repo_tag: socialmediamacroscope/histogram
-            github_repo_tag: ncsa/histogram
-          - name: check_screen_name
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/check_screen_name
-            docker_repo_tag: socialmediamacroscope/check_screen_name
-            github_repo_tag: ncsa/check_screen_name
-          - name: classification_predict
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/classification_predict
-            docker_repo_tag: socialmediamacroscope/classification_predict
-            github_repo_tag: ncsa/classification_predict
-          - name: classification_train
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/classification_train
-            docker_repo_tag: socialmediamacroscope/classification_train
-            github_repo_tag: ncsa/classification_train
-          - name: classification_split
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/classification_split
-            docker_repo_tag: socialmediamacroscope/classification_split
-            github_repo_tag: ncsa/classification_split
-          - name: collect_reddit_comment
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/collect_reddit_comment
-            docker_repo_tag: socialmediamacroscope/collect_reddit_comment
-            github_repo_tag: ncsa/collect_reddit_comment
-          - name: crimson_hexagon_monitors
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/crimson_hexagon_monitors
-            docker_repo_tag: socialmediamacroscope/crimson_hexagon_monitors
-            github_repo_tag: ncsa/crimson_hexagon_monitors
-          - name: image_crawler
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/image_crawler
-            docker_repo_tag: socialmediamacroscope/image_crawler
-            github_repo_tag: ncsa/image_crawler
-          - name: name_entity_recognition
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/name_entity_recognition
-            docker_repo_tag: socialmediamacroscope/name_entity_recognition
-            github_repo_tag: ncsa/name_entity_recognition
-          - name: network_analysis
-            dockerfile: Dockerfile
-            workdir: containerized_analytics/smile/network_analysis
-            docker_repo_tag: 
socialmediamacroscope/network_analysis - github_repo_tag: ncsa/network_analysis - - name: preprocessing - dockerfile: Dockerfile - workdir: containerized_analytics/smile/preprocessing - docker_repo_tag: socialmediamacroscope/preprocessing - github_repo_tag: ncsa/preprocessing - - name: screen_name_prompt - dockerfile: Dockerfile - workdir: containerized_analytics/smile/screen_name_prompt - docker_repo_tag: socialmediamacroscope/screen_name_prompt - github_repo_tag: ncsa/screen_name_prompt - - name: sentiment_analysis - dockerfile: Dockerfile - workdir: containerized_analytics/smile/sentiment_analysis - docker_repo_tag: socialmediamacroscope/sentiment_analysis - github_repo_tag: ncsa/sentiment_analysis - - name: topic_modeling - dockerfile: Dockerfile - workdir: containerized_analytics/smile/topic_modeling - docker_repo_tag: socialmediamacroscope/topic_modeling - github_repo_tag: ncsa/topic_modeling - - name: clowder_create_collection - dockerfile: Dockerfile - workdir: containerized_analytics/clowder/clowder_create_collection - docker_repo_tag: socialmediamacroscope/clowder_create_collection - github_repo_tag: ncsa/clowder_create_collection - - name: clowder_create_dataset - dockerfile: Dockerfile - workdir: containerized_analytics/clowder/clowder_create_dataset - docker_repo_tag: socialmediamacroscope/clowder_create_dataset - github_repo_tag: ncsa/clowder_create_dataset - - name: clowder_create_space - dockerfile: Dockerfile - workdir: containerized_analytics/clowder/clowder_create_space - docker_repo_tag: socialmediamacroscope/clowder_create_space - github_repo_tag: ncsa/clowder_create_space - - name: clowder_list - dockerfile: Dockerfile - workdir: containerized_analytics/clowder/clowder_list - docker_repo_tag: socialmediamacroscope/clowder_list - github_repo_tag: ncsa/clowder_list - - name: clowder_upload_file - dockerfile: Dockerfile - workdir: containerized_analytics/clowder/clowder_upload_file - docker_repo_tag: socialmediamacroscope/clowder_upload_file - github_repo_tag: ncsa/clowder_upload_file + +env: + MAIN_REPO: ncsa/standalone-smm-analytics + +jobs: + # ---------------------------------------------------------------------- + # DOCKER BUILD + # ---------------------------------------------------------------------- + docker: + runs-on: ubuntu-latest steps: # checkout source code - uses: actions/checkout@v2 @@ -160,7 +64,7 @@ jobs: echo "GITHUB_BRANCH=${BRANCH}" >> $GITHUB_ENV if [ "$BRANCH" == "main" ]; then - CHANGELOG_FILE="${{ matrix.workdir }}/CHANGELOG.md" + CHANGELOG_FILE="containerized_analytics/smile/${{ inputs.specific_package }}/CHANGELOG.md" if [ -e "$CHANGELOG_FILE" ]; then VERSION=$(cat "$CHANGELOG_FILE" | grep -Eo '\[[0-9]+\.[0-9]+\.[0-9]+\]'| head -1 | tr -d '[]') VERSIONS="latest" @@ -203,10 +107,10 @@ jobs: env.HUBPUSH == 'github' || env.HUBPUSH == 'dockerhub' uses: elgohr/Publish-Docker-Github-Action@3.04 with: - dockerfile: ${{ matrix.dockerfile }} - name: ${{ matrix.docker_repo_tag }} + dockerfile: Dockerfile + name: socialmediamacroscope/${{ inputs.specific_package }} no_push: true - workdir: ${{ matrix.workdir }} + workdir: containerized_analytics/smile/${{ inputs.specific_package }} - name: Publish doc image to Docker Hub if: | @@ -214,11 +118,11 @@ jobs: uses: elgohr/Publish-Docker-Github-Action@3.04 with: registry: docker.io - name: ${{ matrix.docker_repo_tag }} + name: socialmediamacroscope/${{ inputs.specific_package }} username: ${{ secrets.HUB_USERNAME }} password: ${{ secrets.HUB_PASSWORD }} tags: "${{ env.TAGS }}" - workdir: ${{ matrix.workdir }} + 
workdir: containerized_analytics/smile/${{ inputs.specific_package }}
 
       - name: Publish doc image to Github
         if: |
         uses: elgohr/Publish-Docker-Github-Action@3.04
         with:
           registry: ghcr.io
-          name: ${{ matrix.github_repo_tag }}
+          name: ncsa/${{ inputs.specific_package }}
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
           tags: "${{ env.TAGS }}"
-          workdir: ${{ matrix.workdir }}
+          workdir: containerized_analytics/smile/${{ inputs.specific_package }}
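The tagging step earlier in this workflow derives the image version by grepping the selected package's CHANGELOG.md for the first `[x.y.z]` heading. A rough Python equivalent of that `grep | head | tr` pipeline, handy for checking a changelog locally (the function name and error handling are mine):

```python
import re

def latest_changelog_version(changelog_path):
    # Mirrors: grep -Eo '\[[0-9]+\.[0-9]+\.[0-9]+\]' | head -1 | tr -d '[]'
    with open(changelog_path) as f:
        for line in f:
            match = re.search(r'\[(\d+\.\d+\.\d+)\]', line)
            if match:
                return match.group(1)  # first hit wins, like head -1
    return None  # no Keep-a-Changelog version heading found

# e.g. latest_changelog_version(
#     'containerized_analytics/smile/histogram/CHANGELOG.md')  -> '0.1.2'
```

Because the new workflow hardcodes `containerized_analytics/smile/` in the changelog and workdir paths, this lookup only matches the smile packages; the clowder entries live under `containerized_analytics/clowder/`.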
From 0eda1fafd2432712863172a1cfebdbd37bfb7f4d Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 22 Aug 2024 17:03:05 -0500
Subject: [PATCH 5/6] fix id field and update changelog (#125)

---
 .../smile/preprocessing/CHANGELOG.md     |  4 ++
 .../smile/preprocessing/preprocessing.py | 47 ++++++------------
 2 files changed, 19 insertions(+), 32 deletions(-)

diff --git a/containerized_analytics/smile/preprocessing/CHANGELOG.md b/containerized_analytics/smile/preprocessing/CHANGELOG.md
index 7d167e72..aedf8dc9 100644
--- a/containerized_analytics/smile/preprocessing/CHANGELOG.md
+++ b/containerized_analytics/smile/preprocessing/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.6] - 08-22-2024
+
+### Fixed
+- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)
 
 ## [0.1.5] - 01-23-2024
 
diff --git a/containerized_analytics/smile/preprocessing/preprocessing.py b/containerized_analytics/smile/preprocessing/preprocessing.py
index 6e520cd7..cc98a1e1 100644
--- a/containerized_analytics/smile/preprocessing/preprocessing.py
+++ b/containerized_analytics/smile/preprocessing/preprocessing.py
@@ -10,38 +10,21 @@ class Preprocess:
 
     def __init__(self, df, column):
-        self.id_column = "id"
-        if 'id_str' in df.columns:
-            self.id_column = 'id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'id' in df.columns:
-            self.id_column = 'id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif 'comment_id' in df.columns:
-            self.id_column = 'comment_id'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id_str':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        elif '_source.id':
-            self.id_column = '_source.id_str'
-            df_new = df[df[column] != ''][[self.id_column, column]].dropna()
-            sentences = df_new[column].astype('str').tolist()
-            self.id = df_new[self.id_column].astype('str').tolist()
-        else:
-            sentences = df[df[column] != ''][column].dropna().astype(
-                'str').tolist()
-            self.id = []
-
-        sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences]
+        # Define potential id columns in order of precedence
+        potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']
+
+        # Find the first available id column from the potential list
+        self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index')
+
+        # If using index as the id_column, create a new column based on the index
+        if self.id_column == 'index':
+            df[self.id_column] = df.index.astype('str')
+
+        # Filter the dataframe based on the column condition
+        df_new = df[df[column] != ''][[self.id_column, column]].dropna()
+        sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()]
+
+        self.id = df_new[self.id_column].astype('str').tolist()
         self.sentences = sentences
 
     def get_phrases(self):

From 422f10321370abfb265ea1c97b1962f1a3d9e2ca Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Thu, 22 Aug 2024 17:05:40 -0500
Subject: [PATCH 6/6] update changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d7a0f67..b82c994a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Beta-2] - 08/31/2024
+
+### Changed
+- GitHub Manual action for more flexible docker image building and pushing [#126](https://github.com/ncsa/standalone-smm-analytics/issues/126)
+
 ## [Beta-1] - 01-23-2024
 
 ### Added
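To close, a quick illustration of the patch 5 fix: the rewritten `__init__` resolves the id column with a single `next()` over an ordered candidate list instead of the old if/elif chain — which, as the removed lines show, contained always-true branches like `elif '_source.id_str':` — and falls back to the DataFrame index when no candidate exists. A demonstration with made-up frames:

```python
import pandas as pd

potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']

def resolve_id_column(df):
    # First candidate present in the frame wins; 'index' is the fallback sentinel.
    return next((col for col in potential_id_columns if col in df.columns), 'index')

reddit = pd.DataFrame({'comment_id': ['c1', 'c2'], 'text': ['hello', 'world']})
print(resolve_id_column(reddit))  # 'comment_id'

no_ids = pd.DataFrame({'text': ['no id columns here']})
print(resolve_id_column(no_ids))  # 'index' -> __init__ then materializes df.index
```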