[WIP] Release/beta 2 #120

Open · wants to merge 8 commits into base: main
154 changes: 29 additions & 125 deletions .github/workflows/manual.yml
@@ -4,24 +4,19 @@ on:
workflow_dispatch:
inputs:
push_to_where:
description: 'Type either "dockerhub" or "github" to push to the respective registry'
description: 'Select the registry to push to'
default: 'github'
required: true
type: string

env:
MAIN_REPO: ncsa/standalone-smm-analytics

jobs:
# ----------------------------------------------------------------------
# DOCKER BUILD
# ----------------------------------------------------------------------
docker:
runs-on: ubuntu-latest
strategy:
fail-fast: true
matrix:
name:
type: choice
options:
- github
- dockerhub
specific_package:
description: 'Select the specific package to build'
default: 'sentiment_analysis'
required: true
type: choice
options:
- autophrase
- histogram
- check_screen_name
@@ -42,107 +37,16 @@ jobs:
- clowder_create_space
- clowder_list
- clowder_upload_file
include:
- name: autophrase
dockerfile: Dockerfile
workdir: containerized_analytics/smile/autophrase
docker_repo_tag: socialmediamacroscope/autophrase
github_repo_tag: ncsa/autophrase
- name: histogram
dockerfile: Dockerfile
workdir: containerized_analytics/smile/histogram
docker_repo_tag: socialmediamacroscope/histogram
github_repo_tag: ncsa/histogram
- name: check_screen_name
dockerfile: Dockerfile
workdir: containerized_analytics/smile/check_screen_name
docker_repo_tag: socialmediamacroscope/check_screen_name
github_repo_tag: ncsa/check_screen_name
- name: classification_predict
dockerfile: Dockerfile
workdir: containerized_analytics/smile/classification_predict
docker_repo_tag: socialmediamacroscope/classification_predict
github_repo_tag: ncsa/classification_predict
- name: classification_train
dockerfile: Dockerfile
workdir: containerized_analytics/smile/classification_train
docker_repo_tag: socialmediamacroscope/classification_train
github_repo_tag: ncsa/classification_train
- name: classification_split
dockerfile: Dockerfile
workdir: containerized_analytics/smile/classification_split
docker_repo_tag: socialmediamacroscope/classification_split
github_repo_tag: ncsa/classification_split
- name: collect_reddit_comment
dockerfile: Dockerfile
workdir: containerized_analytics/smile/collect_reddit_comment
docker_repo_tag: socialmediamacroscope/collect_reddit_comment
github_repo_tag: ncsa/collect_reddit_comment
- name: crimson_hexagon_monitors
dockerfile: Dockerfile
workdir: containerized_analytics/smile/crimson_hexagon_monitors
docker_repo_tag: socialmediamacroscope/crimson_hexagon_monitors
github_repo_tag: ncsa/crimson_hexagon_monitors
- name: image_crawler
dockerfile: Dockerfile
workdir: containerized_analytics/smile/image_crawler
docker_repo_tag: socialmediamacroscope/image_crawler
github_repo_tag: ncsa/image_crawler
- name: name_entity_recognition
dockerfile: Dockerfile
workdir: containerized_analytics/smile/name_entity_recognition
docker_repo_tag: socialmediamacroscope/name_entity_recognition
github_repo_tag: ncsa/name_entity_recognition
- name: network_analysis
dockerfile: Dockerfile
workdir: containerized_analytics/smile/network_analysis
docker_repo_tag: socialmediamacroscope/network_analysis
github_repo_tag: ncsa/network_analysis
- name: preprocessing
dockerfile: Dockerfile
workdir: containerized_analytics/smile/preprocessing
docker_repo_tag: socialmediamacroscope/preprocessing
github_repo_tag: ncsa/preprocessing
- name: screen_name_prompt
dockerfile: Dockerfile
workdir: containerized_analytics/smile/screen_name_prompt
docker_repo_tag: socialmediamacroscope/screen_name_prompt
github_repo_tag: ncsa/screen_name_prompt
- name: sentiment_analysis
dockerfile: Dockerfile
workdir: containerized_analytics/smile/sentiment_analysis
docker_repo_tag: socialmediamacroscope/sentiment_analysis
github_repo_tag: ncsa/sentiment_analysis
- name: topic_modeling
dockerfile: Dockerfile
workdir: containerized_analytics/smile/topic_modeling
docker_repo_tag: socialmediamacroscope/topic_modeling
github_repo_tag: ncsa/topic_modeling
- name: clowder_create_collection
dockerfile: Dockerfile
workdir: containerized_analytics/clowder/clowder_create_collection
docker_repo_tag: socialmediamacroscope/clowder_create_collection
github_repo_tag: ncsa/clowder_create_collection
- name: clowder_create_dataset
dockerfile: Dockerfile
workdir: containerized_analytics/clowder/clowder_create_dataset
docker_repo_tag: socialmediamacroscope/clowder_create_dataset
github_repo_tag: ncsa/clowder_create_dataset
- name: clowder_create_space
dockerfile: Dockerfile
workdir: containerized_analytics/clowder/clowder_create_space
docker_repo_tag: socialmediamacroscope/clowder_create_space
github_repo_tag: ncsa/clowder_create_space
- name: clowder_list
dockerfile: Dockerfile
workdir: containerized_analytics/clowder/clowder_list
docker_repo_tag: socialmediamacroscope/clowder_list
github_repo_tag: ncsa/clowder_list
- name: clowder_upload_file
dockerfile: Dockerfile
workdir: containerized_analytics/clowder/clowder_upload_file
docker_repo_tag: socialmediamacroscope/clowder_upload_file
github_repo_tag: ncsa/clowder_upload_file

env:
MAIN_REPO: ncsa/standalone-smm-analytics

jobs:
# ----------------------------------------------------------------------
# DOCKER BUILD
# ----------------------------------------------------------------------
docker:
runs-on: ubuntu-latest
steps:
# checkout source code
- uses: actions/checkout@v2
@@ -160,7 +64,7 @@ jobs:
echo "GITHUB_BRANCH=${BRANCH}" >> $GITHUB_ENV

if [ "$BRANCH" == "main" ]; then
CHANGELOG_FILE="${{ matrix.workdir }}/CHANGELOG.md"
CHANGELOG_FILE="containerized_analytics/smile/${{ inputs.specific_package }}/CHANGELOG.md"
if [ -e "$CHANGELOG_FILE" ]; then
VERSION=$(cat "$CHANGELOG_FILE" | grep -Eo '\[[0-9]+\.[0-9]+\.[0-9]+\]'| head -1 | tr -d '[]')
VERSIONS="latest"
@@ -203,31 +107,31 @@ jobs:
env.HUBPUSH == 'github' || env.HUBPUSH == 'dockerhub'
uses: elgohr/Publish-Docker-Github-Action@3.04
with:
dockerfile: ${{ matrix.dockerfile }}
name: ${{ matrix.docker_repo_tag }}
dockerfile: Dockerfile
name: socialmediamacroscope/${{ inputs.specific_package }}
no_push: true
workdir: ${{ matrix.workdir }}
workdir: containerized_analytics/smile/${{ inputs.specific_package }}

- name: Publish doc image to Docker Hub
if: |
env.HUBPUSH == 'dockerhub'
uses: elgohr/Publish-Docker-Github-Action@3.04
with:
registry: docker.io
name: ${{ matrix.docker_repo_tag }}
name: socialmediamacroscope/${{ inputs.specific_package }}
username: ${{ secrets.HUB_USERNAME }}
password: ${{ secrets.HUB_PASSWORD }}
tags: "${{ env.TAGS }}"
workdir: ${{ matrix.workdir }}
workdir: containerized_analytics/smile/${{ inputs.specific_package }}

- name: Publish doc image to Github
if: |
env.HUBPUSH == 'github'
uses: elgohr/Publish-Docker-Github-Action@3.04
with:
registry: ghcr.io
name: ${{ matrix.github_repo_tag }}
name: ncsa/${{ inputs.specific_package }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
tags: "${{ env.TAGS }}"
workdir: ${{ matrix.workdir }}
workdir: containerized_analytics/smile/${{ inputs.specific_package }}
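
With the build matrix replaced by two `choice` inputs, each manual run now builds exactly one package, and the UI only offers the listed values. As a minimal sketch, the same run can also be triggered through the GitHub REST API; the token handling and input values below are assumptions for illustration:

```python
# Hypothetical dispatch of the refactored manual.yml workflow via the
# GitHub REST API; GITHUB_TOKEN and the chosen inputs are assumptions.
import os
import requests

token = os.environ["GITHUB_TOKEN"]  # assumed: a token with workflow scope
resp = requests.post(
    "https://api.github.com/repos/ncsa/standalone-smm-analytics"
    "/actions/workflows/manual.yml/dispatches",
    headers={
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
    },
    json={
        "ref": "main",
        # choice-typed inputs: only values listed in the workflow are accepted
        "inputs": {"push_to_where": "github", "specific_package": "histogram"},
    },
    timeout=30,
)
resp.raise_for_status()  # the API returns 204 No Content on success
```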
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Beta-2] - 08-31-2024

### Changed
- GitHub manual action for more flexible Docker image building and pushing [#126](https://github.com/ncsa/standalone-smm-analytics/issues/126)

## [Beta-1] - 01-23-2024

### Added
10 changes: 10 additions & 0 deletions containerized_analytics/smile/histogram/CHANGELOG.md
@@ -0,0 +1,10 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.2] - 2024-05-16

### Changed
- Add YouTube videos as a source to the histogram [#118](https://github.com/ncsa/standalone-smm-analytics/issues/118)
4 changes: 2 additions & 2 deletions containerized_analytics/smile/histogram/histogram.py
@@ -67,9 +67,9 @@ def plot_freq(index, counts, interval, localPath, remotePath):
return div_url


def count_freq(df, time_col_name, time_freq, time_unit):
def count_freq(df, time_col_name, time_freq, time_unit=None):
# convert time column to datetime
df[time_col_name] = pd.to_datetime(df[time_col_name],unit=time_unit)
df[time_col_name] = pd.to_datetime(df[time_col_name], unit=time_unit)
# set index to datetime
df.set_index(df[time_col_name],inplace=True)

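
The signature change above makes `time_unit` optional because `pd.to_datetime` only needs a `unit` for numeric epoch values; ISO-8601 strings such as YouTube's `snippet.publishedAt` parse without one. A small sketch of both paths (sample values are illustrative):

```python
import pandas as pd

# Epoch seconds (e.g. info.dateuploaded) need an explicit unit...
epochs = pd.Series([1715800000, 1715886400])
print(pd.to_datetime(epochs, unit="s"))

# ...while ISO-8601 strings (e.g. snippet.publishedAt) parse with unit=None,
# which is why count_freq can now be called without a time_unit argument.
stamps = pd.Series(["2024-05-16T12:00:00Z", "2024-05-17T12:00:00Z"])
print(pd.to_datetime(stamps))
```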
10 changes: 10 additions & 0 deletions containerized_analytics/smile/histogram/rabbitmq_handler.py
@@ -110,7 +110,17 @@ def rabbitmq_handler(ch, method, properties, body):
interval = '1M'
freq = count_freq(df, 'info.dateuploaded', interval, 's')

# youtube videos
elif 'snippet.publishedAt' in df.columns:
# default at 1 month
if 'interval' in event:
interval = event['interval']
else:
interval = '1M'
freq = count_freq(df, 'snippet.publishedAt', interval)

else:
print("No valid time column found")
return {'url': 'null'}

index = freq.index.tolist()
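
Condensed, the handler's time-column dispatch now behaves like the sketch below; `count_freq` comes from `histogram.py` above, while the standalone function name and import path here are hypothetical:

```python
from histogram import count_freq  # assumed import path within this package


def bucket_uploads(df, event):
    # Default to monthly buckets unless the request specifies an interval.
    interval = event.get("interval", "1M")
    if "info.dateuploaded" in df.columns:      # epoch-seconds column
        return count_freq(df, "info.dateuploaded", interval, "s")
    if "snippet.publishedAt" in df.columns:    # YouTube ISO-8601 column
        return count_freq(df, "snippet.publishedAt", interval)
    print("No valid time column found")
    return None
```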
4 changes: 4 additions & 0 deletions containerized_analytics/smile/preprocessing/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.6] - 08-22-2024

### Fixed
- ID field not extracted correctly from the data source [#121](https://github.com/ncsa/standalone-smm-analytics/issues/121)

## [0.1.5] - 01-23-2024

47 changes: 15 additions & 32 deletions containerized_analytics/smile/preprocessing/preprocessing.py
@@ -10,38 +10,21 @@ class Preprocess:

def __init__(self, df, column):

self.id_column = "id"
if 'id_str' in df.columns:
self.id_column = 'id_str'
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = df_new[column].astype('str').tolist()
self.id = df_new[self.id_column].astype('str').tolist()
elif 'id' in df.columns:
self.id_column = 'id'
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = df_new[column].astype('str').tolist()
self.id = df_new[self.id_column].astype('str').tolist()
elif 'comment_id' in df.columns:
self.id_column = 'comment_id'
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = df_new[column].astype('str').tolist()
self.id = df_new[self.id_column].astype('str').tolist()
elif '_source.id_str':
self.id_column = '_source.id_str'
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = df_new[column].astype('str').tolist()
self.id = df_new[self.id_column].astype('str').tolist()
elif '_source.id':
self.id_column = '_source.id_str'
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = df_new[column].astype('str').tolist()
self.id = df_new[self.id_column].astype('str').tolist()
else:
sentences = df[df[column] != ''][column].dropna().astype(
'str').tolist()
self.id = []

sentences = [re.sub(r"http\S+", "", tweet) for tweet in sentences]
# Define potential id columns in order of precedence
potential_id_columns = ['id_str', 'id', 'comment_id', '_source.id_str', '_source.id']

# Find the first available id column from the potential list
self.id_column = next((col for col in potential_id_columns if col in df.columns), 'index')

# If using index as the id_column, create a new column based on the index
if self.id_column == 'index':
df[self.id_column] = df.index.astype('str')

# Filter the dataframe based on the column condition
df_new = df[df[column] != ''][[self.id_column, column]].dropna()
sentences = [re.sub(r"http\S+", "", str(tweet)) for tweet in df_new[column].tolist()]

self.id = df_new[self.id_column].astype('str').tolist()
self.sentences = sentences

def get_phrases(self):
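
The rewrite also fixes the old `elif '_source.id_str':` branches, which were always truthy because they tested a string literal rather than column membership. The `next()` call scans the candidates in order and falls back to a synthetic `index` column, as this toy example shows (data values are made up):

```python
import pandas as pd

potential_id_columns = ["id_str", "id", "comment_id", "_source.id_str", "_source.id"]

# 'id' is present, so it wins over the later candidates.
df = pd.DataFrame({"id": [1, 2], "comment_id": ["a", "b"], "text": ["hi", "yo"]})
print(next((c for c in potential_id_columns if c in df.columns), "index"))  # id

# No candidate present: fall back to a column built from the index.
df2 = pd.DataFrame({"text": ["hi", "yo"]})
chosen = next((c for c in potential_id_columns if c in df2.columns), "index")
if chosen == "index":
    df2[chosen] = df2.index.astype("str")
print(df2[chosen].tolist())  # ['0', '1']
```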
4 changes: 4 additions & 0 deletions containerized_analytics/smile/topic_modeling/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.6] - 07-16-2024

### Changed
- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123)

## [0.1.5] - 01-23-2024

1 change: 1 addition & 0 deletions containerized_analytics/smile/topic_modeling/Dockerfile
@@ -12,6 +12,7 @@ ENV RABBITMQ_HOST="rabbitmq"
# install dependency libraries and download required data
RUN pip install --no-cache-dir -r requirement.txt \
&& python3 -m nltk.downloader -d /usr/local/share/nltk_data stopwords wordnet \
&& python3 -m spacy download en_core_web_sm \
# cron job clean tmp folder
&& chmod u+x ./clear_cache.sh \
&& chmod 0644 ./clear_cache_cron \
13 changes: 12 additions & 1 deletion containerized_analytics/smile/topic_modeling/algorithm.py
@@ -12,7 +12,18 @@ def algorithm(df, params):

output = {}

gensim_tm = Gensim_Topic_Modeling(df, column=params["column"])
# Check if english_only and language_score exist in params
english_only_param = params["english_only"] if "english_only" in params else True
language_score_param = params["language_score"] if "language_score" in params else 0.9

# Call the Gensim_Topic_Modeling function
gensim_tm = Gensim_Topic_Modeling(
df,
column=params["column"],
english_only=english_only_param,
language_score=language_score_param
)

data_lemmatized, id2word, corpus = gensim_tm.preprocessing()
output['lemmatized'] = data_lemmatized

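
The two conditional expressions are equivalent to `dict.get` with a default; a one-line sketch of the same behavior:

```python
# Same defaults as the diff above, expressed with dict.get.
params = {"column": "text"}  # e.g. a request with no language options
english_only_param = params.get("english_only", True)
language_score_param = params.get("language_score", 0.9)
print(english_only_param, language_score_param)  # True 0.9
```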
@@ -7,15 +7,36 @@
from nltk import WordNetLemmatizer
import pyLDAvis
import pyLDAvis.gensim
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language


class Gensim_Topic_Modeling:

def __init__(self, df, column):
def __init__(self, df, column, english_only=True, language_score=0.9):
self.data = df[df[column] != ''][column].dropna().astype(
'str').tolist()

# Load a SpaCy model
self.nlp = spacy.load('en_core_web_sm')

# Add the language detector to the pipeline
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
return LanguageDetector()

self.nlp.add_pipe('language_detector', last=True)
self.english_only = english_only
self.language_score = language_score

def preprocessing(self):
# Detect and keep only English texts
if self.english_only:
self.data = [sent for sent in self.data if
self.nlp(sent)._.language['language'] == 'en'
and self.nlp(sent)._.language['score'] > self.language_score]

self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data]
self.data = [re.sub('\s+', ' ', sent) for sent in self.data]
self.data = [re.sub("\'", "", sent) for sent in self.data]
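
For reference, a minimal sketch of what the added `language_detector` component yields per document, assuming `en_core_web_sm` and `spacy_langdetect` are installed (the Dockerfile change above downloads the model):

```python
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector


@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    return LanguageDetector()


nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("language_detector", last=True)

doc = nlp("This sentence is in English.")
print(doc._.language)  # e.g. {'language': 'en', 'score': 0.99...}

# preprocessing() keeps a text only when both checks pass:
keep = (doc._.language["language"] == "en"
        and doc._.language["score"] > 0.9)
print(keep)
```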