From 9c8fc8c7e70d6e6889c5ee292ab7d8ed2dfd103a Mon Sep 17 00:00:00 2001
From: taranehkhosrojerdi
<87570595+taranehkhosrojerdi@users.noreply.github.com>
Date: Mon, 1 Jul 2024 22:20:06 +0330
Subject: [PATCH] add phase 4 edited files
---
bonus_phase/main.py | 219 +++++++++++++++++++++++++++++++++++++++++++
bonus_phase/utils.py | 131 ++++++++++++++++++++++++++
2 files changed, 350 insertions(+)
create mode 100644 bonus_phase/utils.py
diff --git a/bonus_phase/main.py b/bonus_phase/main.py
index 8b13789..b9e8dca 100644
--- a/bonus_phase/main.py
+++ b/bonus_phase/main.py
@@ -1 +1,220 @@
+import streamlit as st
+import sys
+sys.path.append(r"D:\University\MIR_project_Spring2024-Phase-1.2")
+import os
+from Logic import utils
+import time
+from enum import Enum
+import random
+from Logic.core.snippet import Snippet
+from Logic.core.preprocess import Preprocessor
+snippet_obj = Snippet(
+ number_of_words_on_each_side=5
+)
+
+class color(Enum):
+ RED = "#00BFFF" # Light blue
+ GREEN = "#00CED1" # Cyan
+ BLUE = "#1E90FF" # Light blue
+ YELLOW = "#00FFFF" # Cyan
+ PURPLE = "#ADD8E6" # Light blue
+ ORANGE = "#87CEEB" # Light blue
+ CYAN = "#F0FFFF" # Light cyan
+ MAGENTA = "#E0FFFF" # Light cyan
+
+def get_summary_with_snippet(movie_info, query):
+ summary = movie_info["first_page_summary"]
+ snippet, not_exist_words = snippet_obj.find_snippet(summary, query)
+ if "***" in snippet:
+ snippet = snippet.split()
+ for i in range(len(snippet)):
+ current_word = snippet[i]
+ if current_word.startswith("***") and current_word.endswith("***"):
+ current_word_without_star = current_word[3:-3]
+                # Wrap the matched query term in bold, colored markup so it stands out;
+                # main() renders the summary via st.markdown(..., unsafe_allow_html=True).
+                summary = summary.lower().replace(
+                    current_word_without_star,
+                    f"<b><font color='{random.choice(list(color)).value}'>{current_word_without_star}</font></b>",
+                )
+ return summary
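+
+# Illustrative note (assumptions about Snippet.find_snippet, which is not part of this patch):
+# find_snippet is expected to return the snippet with matched query words wrapped in triple
+# asterisks, plus the list of query words missing from the text, e.g.
+#   find_snippet("the dark knight rises", "dark knight") -> ("the ***dark*** ***knight*** rises", [])
+# get_summary_with_snippet then swaps each marked word for colored <b><font> markup, which
+# main() renders with st.markdown(..., unsafe_allow_html=True).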
+
+def search_time(start, end):
+ st.success("Search took: {:.6f} milli-seconds".format((end - start) * 1e3))
+
+def toggle_star_state(movie_id):
+ # session_state = st.session_state
+ # if 'star_states' not in session_state:
+ # session_state.star_states = {}
+
+ # if movie_id in session_state.star_states:
+ # session_state.star_states[movie_id] = not session_state.star_states[movie_id]
+ # else:
+ # session_state.star_states[movie_id] = True
+ pass
+
+def search_handling(
+ search_button,
+ search_term,
+ search_max_num,
+ search_weights,
+ search_method,
+):
+ if search_button:
+        spell_correction_dataset = [
+            summary
+            for movie in utils.movies_dataset
+            for summary in (movie["summaries"] or [])  # some crawled movies may have no summaries
+        ]
+
+ spell_correction_dataset = Preprocessor(spell_correction_dataset).preprocess()
+ corrected_query = utils.correct_text(search_term, spell_correction_dataset)
+
+ if corrected_query != search_term:
+ st.warning(f"Your search terms were corrected to: {corrected_query}")
+ search_term = corrected_query
+
+ with st.spinner("Searching..."):
+ time.sleep(0.5)
+ start_time = time.time()
+ result = utils.search(
+ search_term,
+ search_max_num,
+ search_method,
+ search_weights,
+ )
+ print(f"Result: {result}")
+ end_time = time.time()
+ if len(result) == 0:
+ st.warning("No results found!")
+ return
+
+ search_time(start_time, end_time)
+
+ for i in range(len(result)):
+ card = st.columns([3, 1])
+ info = utils.get_movie_by_id(result[i][0], utils.movies_dataset)
+ movie_id = result[i][0]
+ relevance_score = result[i][1]
+
+ with card[0].container():
+ title_string = f"[{info['title']}]({info['URL']}) - {info['score']}"
+ st.title(title_string)
+
+ if relevance_score > 0:
+ st.markdown("⭐", unsafe_allow_html=True)
+
+ st.markdown(
+ f"Summary: {get_summary_with_snippet(info, search_term)}",
+ unsafe_allow_html=True,
+ )
+
+ with st.container():
+ with st.expander("Details"):
+ st.write(f"Relevance Score: {result[i][1]}")
+ st.markdown("**Directors:**")
+ if info["directors"] is not None:
+ for director in info["directors"]:
+ st.text(director)
+
+ st.markdown("**Stars:**")
+ stars = ", ".join(info["stars"])
+ st.text(stars)
+
+ st.markdown("**Genres:**")
+                        genre_colors = iter([
+                            color.RED.value, color.GREEN.value, color.BLUE.value, color.YELLOW.value,
+                            color.PURPLE.value, color.ORANGE.value, color.CYAN.value, color.MAGENTA.value,
+                        ])
+                        for genre in info["genres"]:
+                            # Use each color once; fall back to the last one if a movie has more than eight genres.
+                            genre_color = next(genre_colors, color.MAGENTA.value)
+                            st.markdown(
+                                f"<span style='color:{genre_color}'>{genre}</span>",
+                                unsafe_allow_html=True,
+                            )
+ with card[1].container():
+ st.image(info["Image_URL"], use_column_width=True)
+
+ st.divider()
+
+def main():
+ st.title("IMDB Movie Search Engine")
+ st.write(
+ "Search through IMDB dataset and find the most relevant movies to your search terms."
+ )
+ st.markdown(
+ 'Developed By: MIR Team at Sharif University',
+ unsafe_allow_html=True,
+ )
+
+ search_term = st.text_input("Search Term", help="Enter the term you want to search for.")
+
+ with st.sidebar:
+ st.header("Advanced Search")
+ search_max_num = st.number_input(
+ "Maximum number of results", min_value=5, max_value=100, value=10, step=5, help="Set the maximum number of search results."
+ )
+ weight_stars = st.slider(
+ "Weight of stars in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to stars in the search results."
+ )
+
+ weight_genres = st.slider(
+ "Weight of genres in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to genres in the search results."
+ )
+
+ weight_summary = st.slider(
+ "Weight of summary in search",
+ min_value=0.0,
+ max_value=1.0,
+ value=1.0,
+ step=0.1,
+ help="Adjust the weight given to the summary in the search results."
+ )
+
+ search_weights = [weight_stars, weight_genres, weight_summary]
+ search_method = st.selectbox(
+ "Search method",
+ ("ltn.lnn", "ltc.lnc", "OkapiBM25"),
+ help="Choose the search method."
+ )
+
+ search_button = st.button("Search", key="search_button")
+
+ search_handling(
+ search_button,
+ search_term,
+ search_max_num,
+ search_weights,
+ search_method,
+ )
+
+ # Custom CSS for search button
+ custom_css = """
+
+ """
+ st.markdown(custom_css, unsafe_allow_html=True)
+
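+# Streamlit apps are launched with the Streamlit CLI rather than plain Python, e.g.:
+#   streamlit run bonus_phase/main.py
+# (path taken from this patch; adjust if the file lives elsewhere).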
+if __name__ == "__main__":
+ main()
diff --git a/bonus_phase/utils.py b/bonus_phase/utils.py
new file mode 100644
index 0000000..6aaa8b9
--- /dev/null
+++ b/bonus_phase/utils.py
@@ -0,0 +1,131 @@
+from typing import Dict, List
+from .core.search import SearchEngine
+from .core.spell_correction import SpellCorrection
+from .core.preprocess import Preprocessor
+from .core.snippet import Snippet
+from .core.indexer.indexes_enum import Indexes, Index_types
+import json
+
+# TODO
+with open("../Logic/tests/IMDB_crawled.json", "r") as f:
+    movies_dataset = json.load(f)
+search_engine = SearchEngine()
+
+
+def correct_text(text: str, all_documents: List[str]) -> str:
+ """
+    Correct the given query text, if it is misspelled, using Jaccard similarity.
+
+    Parameters
+    ----------
+    text: str
+        The query text
+    all_documents : list of str
+        The input documents.
+
+    Returns
+    -------
+    str
+        The corrected form of the given text
+ """
+ # TODO: You can add any preprocessing steps here, if needed!
+ text = Preprocessor([text]).preprocess()[0]
+ # TODO: uncomment for spell correction
+ # spell_correction_obj = SpellCorrection(all_documents)
+ # new_text = ""
+ # for word in text.split():
+ # new_text += spell_correction_obj.spell_check(word) + " "
+ # text = new_text
+ return text
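+
+
+# A minimal sketch of the Jaccard measure mentioned in the docstring above. The real scoring
+# lives in SpellCorrection (not included in this patch); this helper is only illustrative and
+# is not called anywhere in the app.
+def _jaccard_similarity(word_a: str, word_b: str, n: int = 2) -> float:
+    """Jaccard similarity between the character n-gram sets of two words."""
+    grams_a = {word_a[i : i + n] for i in range(len(word_a) - n + 1)}
+    grams_b = {word_b[i : i + n] for i in range(len(word_b) - n + 1)}
+    if not grams_a and not grams_b:
+        return 1.0
+    return len(grams_a & grams_b) / len(grams_a | grams_b)
+
+# Example: _jaccard_similarity("baman", "batman") == 0.5, higher than for unrelated words,
+# so a candidate like "batman" would win when correcting the typo "baman".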
+
+
+def search(
+ query: str,
+ max_result_count: int,
+ method: str = "ltn-lnn",
+ weights: list = [0.3, 0.3, 0.4],
+ should_print=False,
+ preferred_genre: str = None,
+):
+ """
+ Finds relevant documents to query
+
+ Parameters
+ ---------------------------------------------------------------------------------------------------
+    query: str
+        The query string to search for.
+    max_result_count: int
+        Return the top 'max_result_count' docs with the highest scores;
+        if max_result_count = -1, all matching docs are returned.
+
+    method: str
+        Ranking scheme: 'ltn.lnn', 'ltc.lnc' or 'OkapiBM25'.
+
+    weights: list
+        Field weights, in the order [stars, genres, summaries].
+
+    should_print: bool
+        Whether to print intermediate output (not used by this implementation).
+
+    preferred_genre: str
+        A preferred genre to favor (not used by this implementation).
+
+    Returns
+    ----------------------------------------------------------------------------------------------------
+    list
+        Ranked (movie_id, score) pairs for the retrieved documents.
+ """
+ dict_weights = {'stars': weights[0], 'genres': weights[1], 'summaries': weights[2]} # TODO
+ return search_engine.search(
+ query, method, dict_weights, max_results=max_result_count, safe_ranking=True
+ )
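+
+# Illustrative call (the query and weights are made up, not taken from this patch):
+#   results = search("batman", max_result_count=10, method="OkapiBM25", weights=[1.0, 1.0, 1.0])
+#   results[0]  # -> (movie_id, score) of the best match, as consumed by search_handling in main.py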
+
+
+def get_movie_by_id(id: str, movies_dataset: List[Dict[str, str]]) -> Dict[str, str]:
+ """
+ Get movie by its id
+
+ Parameters
+ ---------------------------------------------------------------------------------------------------
+ id: str
+ The id of the movie
+
+ movies_dataset: List[Dict[str, str]]
+ The dataset of movies
+
+ Returns
+ ----------------------------------------------------------------------------------------------------
+ dict
+ The movie with the given id
+ """
+    # TODO: self-added code
+    result = None
+    for movie in movies_dataset:
+        if movie["id"] == id:
+            result = movie
+            break
+    if result is None:
+        # Guard against an unbound variable when the id is not in the local dataset.
+        raise ValueError(f"No movie with id {id!r} found in movies_dataset.")
+
+ # result = movies_dataset.get(
+ # id,
+ # {
+ # "Title": "This is movie's title",
+ # "Summary": "This is a summary",
+ # "URL": "https://www.imdb.com/title/tt0111161/",
+ # "Cast": ["Morgan Freeman", "Tim Robbins"],
+ # "Genres": ["Drama", "Crime"],
+ # "Image_URL": "https://m.media-amazon.com/images/M/MV5BNDE3ODcxYzMtY2YzZC00NmNlLWJiNDMtZDViZWM2MzIxZDYwXkEyXkFqcGdeQXVyNjAwNDUxODI@._V1_.jpg",
+ # },
+ # )
+
+    import requests
+
+ url = "http://www.omdbapi.com/"
+ params = {
+ "i": result['id'],
+ "apikey": "6ee1c49f"
+ }
+
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        movie_data = response.json()
+        # Use .get() so movies without a poster or ratings do not raise a KeyError.
+        result["Image_URL"] = movie_data.get("Poster", "")
+        ratings = movie_data.get("Ratings") or []
+        result["score"] = ratings[0]["Value"] if ratings else "N/A"
+    else:
+        # Defaults so main.py can still render the card if the OMDb request fails.
+        result["Image_URL"] = ""
+        result["score"] = "N/A"
+
+ result["URL"] = (
+ f"https://www.imdb.com/title/{result['id']}" # The url pattern of IMDb movies
+ )
+ return result
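+
+# Illustrative call (tt0111161 is the IMDb id used in the commented-out placeholder above;
+# any id present in the crawled dataset works):
+#   info = get_movie_by_id("tt0111161", movies_dataset)
+#   info["title"], info["Image_URL"], info["score"]  # fields main.py uses to render the result card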