From 9c8fc8c7e70d6e6889c5ee292ab7d8ed2dfd103a Mon Sep 17 00:00:00 2001
From: taranehkhosrojerdi
<87570595+taranehkhosrojerdi@users.noreply.github.com>
Date: Mon, 1 Jul 2024 22:20:06 +0330
Subject: [PATCH] add phase 4 edited files
---
bonus_phase/main.py | 219 +++++++++++++++++++++++++++++++++++++++++++
bonus_phase/utils.py | 131 ++++++++++++++++++++++++++
2 files changed, 350 insertions(+)
create mode 100644 bonus_phase/utils.py
diff --git a/bonus_phase/main.py b/bonus_phase/main.py
index 8b13789..b9e8dca 100644
--- a/bonus_phase/main.py
+++ b/bonus_phase/main.py
@@ -1 +1,220 @@
+import streamlit as st
+import sys
+sys.path.append(r"D:\University\MIR_project_Spring2024-Phase-1.2")
+import os
+from Logic import utils
+import time
+from enum import Enum
+import random
+from Logic.core.snippet import Snippet
+from Logic.core.preprocess import Preprocessor
+snippet_obj = Snippet(
+ number_of_words_on_each_side=5
+)
+
+class color(Enum):
+ RED = "#00BFFF" # Light blue
+ GREEN = "#00CED1" # Cyan
+ BLUE = "#1E90FF" # Light blue
+ YELLOW = "#00FFFF" # Cyan
+ PURPLE = "#ADD8E6" # Light blue
+ ORANGE = "#87CEEB" # Light blue
+ CYAN = "#F0FFFF" # Light cyan
+ MAGENTA = "#E0FFFF" # Light cyan
+
+def get_summary_with_snippet(movie_info, query):
+ summary = movie_info["first_page_summary"]
+ snippet, not_exist_words = snippet_obj.find_snippet(summary, query)
+ if "***" in snippet:
+ snippet = snippet.split()
+ for i in range(len(snippet)):
+ current_word = snippet[i]
+ if current_word.startswith("***") and current_word.endswith("***"):
+ current_word_without_star = current_word[3:-3]
+                # Wrap the matched query term in bold, colored markup so it stands out;
+                # main() renders the summary via st.markdown(..., unsafe_allow_html=True).
+                summary = summary.lower().replace(
+                    current_word_without_star,
+                    f"<b><font color='{random.choice(list(color)).value}'>{current_word_without_star}</font></b>",
+                )
+ return summary
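+
+# Illustrative note (assumptions about Snippet.find_snippet, which is not part of this patch):
+# find_snippet is expected to return the snippet with matched query words wrapped in triple
+# asterisks, plus the list of query words missing from the text, e.g.
+#   find_snippet("the dark knight rises", "dark knight") -> ("the ***dark*** ***knight*** rises", [])
+# get_summary_with_snippet then swaps each marked word for colored <b><font> markup, which
+# main() renders with st.markdown(..., unsafe_allow_html=True).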
+
+def search_time(start, end):
+ st.success("Search took: {:.6f} milli-seconds".format((end - start) * 1e3))
+
+def toggle_star_state(movie_id):
+ # session_state = st.session_state
+ # if 'star_states' not in session_state:
+ # session_state.star_states = {}
+
+ # if movie_id in session_state.star_states:
+ # session_state.star_states[movie_id] = not session_state.star_states[movie_id]
+ # else:
+ # session_state.star_states[movie_id] = True
+ pass
+
+def search_handling(
+ search_button,
+ search_term,
+ search_max_num,
+ search_weights,
+ search_method,
+):
+ if search_button:
+        spell_correction_dataset = [
+            summary
+            for movie in utils.movies_dataset
+            for summary in (movie["summaries"] or [])  # some crawled movies may have no summaries
+        ]
+
+ spell_correction_dataset = Preprocessor(spell_correction_dataset).preprocess()
+ corrected_query = utils.correct_text(search_term, spell_correction_dataset)
+
+ if corrected_query != search_term:
+ st.warning(f"Your search terms were corrected to: {corrected_query}")
+ search_term = corrected_query
+
+ with st.spinner("Searching..."):
+ time.sleep(0.5)
+ start_time = time.time()
+ result = utils.search(
+ search_term,
+ search_max_num,
+ search_method,
+ search_weights,
+ )
+ print(f"Result: {result}")
+ end_time = time.time()
+ if len(result) == 0:
+ st.warning("No results found!")
+ return
+
+ search_time(start_time, end_time)
+
+ for i in range(len(result)):
+ card = st.columns([3, 1])
+ info = utils.get_movie_by_id(result[i][0], utils.movies_dataset)
+ movie_id = result[i][0]
+ relevance_score = result[i][1]
+
+ with card[0].container():
+ title_string = f"[{info['title']}]({info['URL']}) - {info['score']}"
+ st.title(title_string)
+
+ if relevance_score > 0:
+ st.markdown("⭐", unsafe_allow_html=True)
+
+ st.markdown(
+ f"Summary: {get_summary_with_snippet(info, search_term)}",
+ unsafe_allow_html=True,
+ )
+
+ with st.container():
+ with st.expander("Details"):
+ st.write(f"Relevance Score: {result[i][1]}")
+ st.markdown("**Directors:**")
+ if info["directors"] is not None:
+ for director in info["directors"]:
+ st.text(director)
+
+ st.markdown("**Stars:**")
+ stars = ", ".join(info["stars"])
+ st.text(stars)
+
+ st.markdown("**Genres:**")
+                        genre_colors = iter([
+                            color.RED.value, color.GREEN.value, color.BLUE.value, color.YELLOW.value,
+                            color.PURPLE.value, color.ORANGE.value, color.CYAN.value, color.MAGENTA.value,
+                        ])
+                        for genre in info["genres"]:
+                            # Use each color once; fall back to the last one if a movie has more than eight genres.
+                            genre_color = next(genre_colors, color.MAGENTA.value)
+                            st.markdown(
+                                f"<span style='color:{genre_color}'>{genre}</span>",
+                                unsafe_allow_html=True,
+                            )
+ with card[1].container():
+ st.image(info["Image_URL"], use_column_width=True)
+
+ st.divider()
+
+def main():
+ st.title("IMDB Movie Search Engine")
+ st.write(
+ "Search through IMDB dataset and find the most relevant movies to your search terms."
+ )
+ st.markdown(
+ 'Developed By: MIR Team at Sharif University',
+ unsafe_allow_html=True,
+ )
+
+ search_term = st.text_input("Search Term", help="Enter the term you want to search for.")
+
+ with st.sidebar:
+ st.header("Advanced Search")
+ search_max_num = st.number_input(
+ "Maximum number of results", min_value=5, max_value=100, value=10, step=5, help="Set the maximum number of search results."
+ )
+ weight_stars = st.slider(
+ "Weight of stars in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to stars in the search results."
+ )
+
+ weight_genres = st.slider(
+ "Weight of genres in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to genres in the search results."
+ )
+
+ weight_summary = st.slider(
+ "Weight of summary in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to the summary in the search results."
+ )
+
+ search_weights = [weight_stars, weight_genres, weight_summary]
+ search_method = st.selectbox(
+ "Search method",
+ ("ltn.lnn", "ltc.lnc", "OkapiBM25"),
+ help="Choose the search method."
+ )
+
+ search_button = st.button("Search", key="search_button")
+
+ search_handling(
+ search_button,
+ search_term,
+ search_max_num,
+ search_weights,
+ search_method,
+ )
+
+ # Custom CSS for search button
+ custom_css = """
+
+ """
+ st.markdown(custom_css, unsafe_allow_html=True)
+
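+# Streamlit apps are launched with the Streamlit CLI rather than plain Python, e.g.:
+#   streamlit run bonus_phase/main.py
+# (path taken from this patch; adjust if the file lives elsewhere).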
+if __name__ == "__main__":
+ main()
diff --git a/bonus_phase/utils.py b/bonus_phase/utils.py
new file mode 100644
index 0000000..6aaa8b9
--- /dev/null
+++ b/bonus_phase/utils.py
@@ -0,0 +1,131 @@
+from typing import Dict, List
+from .core.search import SearchEngine
+from .core.spell_correction import SpellCorrection
+from .core.preprocess import Preprocessor
+from .core.snippet import Snippet
+from .core.indexer.indexes_enum import Indexes, Index_types
+import json
+
+# TODO
+with open("../Logic/tests/IMDB_crawled.json", "r") as f:
+    movies_dataset = json.load(f)
+search_engine = SearchEngine()
+
+
+def correct_text(text: str, all_documents: List[str]) -> str:
+ """
+    Correct the given query text, if it is misspelled, using Jaccard similarity.
+
+    Parameters
+    ----------
+    text: str
+        The query text
+    all_documents : list of str
+        The input documents.
+
+    Returns
+    -------
+    str
+        The corrected form of the given text
+ """
+ # TODO: You can add any preprocessing steps here, if needed!
+ text = Preprocessor([text]).preprocess()[0]
+ # TODO: uncomment for spell correction
+ # spell_correction_obj = SpellCorrection(all_documents)
+ # new_text = ""
+ # for word in text.split():
+ # new_text += spell_correction_obj.spell_check(word) + " "
+ # text = new_text
+ return text
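+
+
+# A minimal sketch of the Jaccard measure mentioned in the docstring above. The real scoring
+# lives in SpellCorrection (not included in this patch); this helper is only illustrative and
+# is not called anywhere in the app.
+def _jaccard_similarity(word_a: str, word_b: str, n: int = 2) -> float:
+    """Jaccard similarity between the character n-gram sets of two words."""
+    grams_a = {word_a[i : i + n] for i in range(len(word_a) - n + 1)}
+    grams_b = {word_b[i : i + n] for i in range(len(word_b) - n + 1)}
+    if not grams_a and not grams_b:
+        return 1.0
+    return len(grams_a & grams_b) / len(grams_a | grams_b)
+
+# Example: _jaccard_similarity("baman", "batman") == 0.5, higher than for unrelated words,
+# so a candidate like "batman" would win when correcting the typo "baman".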
+
+
+def search(
+ query: str,
+ max_result_count: int,
+ method: str = "ltn-lnn",
+ weights: list = [0.3, 0.3, 0.4],
+ should_print=False,
+ preferred_genre: str = None,
+):
+ """
+ Finds relevant documents to query
+
+ Parameters
+ ---------------------------------------------------------------------------------------------------
+    query: str
+        The query string to search for.
+    max_result_count: int
+        Return the top 'max_result_count' docs with the highest scores;
+        if max_result_count = -1, all matching docs are returned.
+
+    method: str
+        Ranking scheme: 'ltn.lnn', 'ltc.lnc' or 'OkapiBM25'.
+
+    weights: list
+        Field weights, in the order [stars, genres, summaries].
+
+    should_print: bool
+        Whether to print intermediate output (not used by this implementation).
+
+    preferred_genre: str
+        A preferred genre to favor (not used by this implementation).
+
+    Returns
+    ----------------------------------------------------------------------------------------------------
+    list
+        Ranked (movie_id, score) pairs for the retrieved documents.
+ """
+ dict_weights = {'stars': weights[0], 'genres': weights[1], 'summaries': weights[2]} # TODO
+ return search_engine.search(
+ query, method, dict_weights, max_results=max_result_count, safe_ranking=True
+ )
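+
+# Illustrative call (the query and weights are made up, not taken from this patch):
+#   results = search("batman", max_result_count=10, method="OkapiBM25", weights=[1.0, 1.0, 1.0])
+#   results[0]  # -> (movie_id, score) of the best match, as consumed by search_handling in main.py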
+
+
+def get_movie_by_id(id: str, movies_dataset: List[Dict[str, str]]) -> Dict[str, str]:
+ """
+ Get movie by its id
+
+ Parameters
+ ---------------------------------------------------------------------------------------------------
+ id: str
+ The id of the movie
+
+ movies_dataset: List[Dict[str, str]]
+ The dataset of movies
+
+ Returns
+ ----------------------------------------------------------------------------------------------------
+ dict
+ The movie with the given id
+ """
+    # TODO: self-added code
+    result = None
+    for movie in movies_dataset:
+        if movie["id"] == id:
+            result = movie
+            break
+    if result is None:
+        # Guard against an unbound variable when the id is not in the local dataset.
+        raise ValueError(f"No movie with id {id!r} found in movies_dataset.")
+
+ # result = movies_dataset.get(
+ # id,
+ # {
+ # "Title": "This is movie's title",
+ # "Summary": "This is a summary",
+ # "URL": "https://www.imdb.com/title/tt0111161/",
+ # "Cast": ["Morgan Freeman", "Tim Robbins"],
+ # "Genres": ["Drama", "Crime"],
+ # "Image_URL": "https://m.media-amazon.com/images/M/MV5BNDE3ODcxYzMtY2YzZC00NmNlLWJiNDMtZDViZWM2MzIxZDYwXkEyXkFqcGdeQXVyNjAwNDUxODI@._V1_.jpg",
+ # },
+ # )
+
+    import requests
+
+ url = "http://www.omdbapi.com/"
+ params = {
+ "i": result['id'],
+ "apikey": "6ee1c49f"
+ }
+
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        movie_data = response.json()
+        # Use .get() so movies without a poster or ratings do not raise a KeyError.
+        result["Image_URL"] = movie_data.get("Poster", "")
+        ratings = movie_data.get("Ratings") or []
+        result["score"] = ratings[0]["Value"] if ratings else "N/A"
+    else:
+        # Defaults so main.py can still render the card if the OMDb request fails.
+        result["Image_URL"] = ""
+        result["score"] = "N/A"
+
+ result["URL"] = (
+ f"https://www.imdb.com/title/{result['id']}" # The url pattern of IMDb movies
+ )
+ return result
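+
+# Illustrative call (tt0111161 is the IMDb id used in the commented-out placeholder above;
+# any id present in the crawled dataset works):
+#   info = get_movie_by_id("tt0111161", movies_dataset)
+#   info["title"], info["Image_URL"], info["score"]  # fields main.py uses to render the result card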