From 9c8fc8c7e70d6e6889c5ee292ab7d8ed2dfd103a Mon Sep 17 00:00:00 2001
From: taranehkhosrojerdi <87570595+taranehkhosrojerdi@users.noreply.github.com>
Date: Mon, 1 Jul 2024 22:20:06 +0330
Subject: [PATCH] add phase 4 edited files

---
 bonus_phase/main.py  | 219 +++++++++++++++++++++++++++++++++++++++++++
 bonus_phase/utils.py | 131 ++++++++++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 bonus_phase/utils.py

diff --git a/bonus_phase/main.py b/bonus_phase/main.py
index 8b13789..b9e8dca 100644
--- a/bonus_phase/main.py
+++ b/bonus_phase/main.py
@@ -1 +1,220 @@
+import streamlit as st
+import sys
+sys.path.append(r"D:\University\MIR_project_Spring2024-Phase-1.2")
+import os
+from Logic import utils
+import time
+from enum import Enum
+import random
+from Logic.core.snippet import Snippet
+from Logic.core.preprocess import Preprocessor
+snippet_obj = Snippet(
+    number_of_words_on_each_side=5
+)
+
+class color(Enum):
+    RED = "#00BFFF"  # Light blue
+    GREEN = "#00CED1"  # Cyan
+    BLUE = "#1E90FF"  # Light blue
+    YELLOW = "#00FFFF"  # Cyan
+    PURPLE = "#ADD8E6"  # Light blue
+    ORANGE = "#87CEEB"  # Light blue
+    CYAN = "#F0FFFF"  # Light cyan
+    MAGENTA = "#E0FFFF"  # Light cyan
+
+def get_summary_with_snippet(movie_info, query):
+    summary = movie_info["first_page_summary"]
+    snippet, not_exist_words = snippet_obj.find_snippet(summary, query)
+    if "***" in snippet:
+        snippet = snippet.split()
+        for i in range(len(snippet)):
+            current_word = snippet[i]
+            if current_word.startswith("***") and current_word.endswith("***"):
+                current_word_without_star = current_word[3:-3]
+                summary = summary.lower().replace(
+                    current_word_without_star,
+                    f"{current_word_without_star}",
+                )
+    return summary
+
+def search_time(start, end):
+    st.success("Search took: {:.6f} milli-seconds".format((end - start) * 1e3))
+
+def toggle_star_state(movie_id):
+    # session_state = st.session_state
+    # if 'star_states' not in session_state:
+    #     session_state.star_states = {}
+
+    # if movie_id in session_state.star_states:
+    #     session_state.star_states[movie_id] = not session_state.star_states[movie_id]
+    # else:
+    #     session_state.star_states[movie_id] = True
+    pass
+
+def search_handling(
+    search_button,
+    search_term,
+    search_max_num,
+    search_weights,
+    search_method,
+):
+    if search_button:
+        spell_correction_dataset = [summary for movie in utils.movies_dataset for summary in movie["summaries"]]
+
+        spell_correction_dataset = Preprocessor(spell_correction_dataset).preprocess()
+        corrected_query = utils.correct_text(search_term, spell_correction_dataset)
+
+        if corrected_query != search_term:
+            st.warning(f"Your search terms were corrected to: {corrected_query}")
+            search_term = corrected_query
+
+        with st.spinner("Searching..."):
+            time.sleep(0.5)
+            start_time = time.time()
+            result = utils.search(
+                search_term,
+                search_max_num,
+                search_method,
+                search_weights,
+            )
+            print(f"Result: {result}")
+            end_time = time.time()
+            if len(result) == 0:
+                st.warning("No results found!")
+                return
+
+            search_time(start_time, end_time)
+
+            for i in range(len(result)):
+                card = st.columns([3, 1])
+                info = utils.get_movie_by_id(result[i][0], utils.movies_dataset)
+                movie_id = result[i][0]
+                relevance_score = result[i][1]
+
+                with card[0].container():
+                    title_string = f"[{info['title']}]({info['URL']}) - {info['score']}"
+                    st.title(title_string)
+
+                    if relevance_score > 0:
+                        st.markdown("", unsafe_allow_html=True)
+
+                    st.markdown(
+                        f"Summary: {get_summary_with_snippet(info, search_term)}",
+                        unsafe_allow_html=True,
+                    )
+
+                with st.container():
+                    with st.expander("Details"):
+                        st.write(f"Relevance Score: {result[i][1]}")
+                        st.markdown("**Directors:**")
+                        if info["directors"] is not None:
+                            for director in info["directors"]:
+                                st.text(director)
+
+                        st.markdown("**Stars:**")
+                        stars = ", ".join(info["stars"])
+                        st.text(stars)
+
+                        st.markdown("**Genres:**")
+                        genre_colors = iter([color.RED.value, color.GREEN.value, color.BLUE.value, color.YELLOW.value, color.PURPLE.value, color.ORANGE.value, color.CYAN.value, color.MAGENTA.value])
+                        for genre in info["genres"]:
+                            genre_color = next(genre_colors)
+                            st.markdown(
+                                f"{genre}",
+                                unsafe_allow_html=True,
+                            )
+                with card[1].container():
+                    st.image(info["Image_URL"], use_column_width=True)
+
+                st.divider()
+
+def main():
+    st.title("IMDB Movie Search Engine")
+    st.write(
+        "Search through the IMDB dataset and find the most relevant movies to your search terms."
+    )
+    st.markdown(
+        'Developed By: MIR Team at Sharif University',
+        unsafe_allow_html=True,
+    )
+
+    search_term = st.text_input("Search Term", help="Enter the term you want to search for.")
+
+    with st.sidebar:
+        st.header("Advanced Search")
+        search_max_num = st.number_input(
+            "Maximum number of results", min_value=5, max_value=100, value=10, step=5, help="Set the maximum number of search results."
+        )
+        weight_stars = st.slider(
+            "Weight of stars in search",
+            min_value=0.0,
+            max_value=1.0,
+            value=1.0,
+            step=0.1,
+            help="Adjust the weight given to stars in the search results."
+        )
+
+        weight_genres = st.slider(
+            "Weight of genres in search",
+            min_value=0.0,
+            max_value=1.0,
+            value=1.0,
+            step=0.1,
+            help="Adjust the weight given to genres in the search results."
+        )
+
+        weight_summary = st.slider(
+            "Weight of summary in search",
+            min_value=0.0,
+            max_value=1.0,
+            value=1.0,
+            step=0.1,
+            help="Adjust the weight given to the summary in the search results."
+        )
+
+        search_weights = [weight_stars, weight_genres, weight_summary]
+        search_method = st.selectbox(
+            "Search method",
+            ("ltn.lnn", "ltc.lnc", "OkapiBM25"),
+            help="Choose the search method."
+        )
+
+    search_button = st.button("Search", key="search_button")
+
+    search_handling(
+        search_button,
+        search_term,
+        search_max_num,
+        search_weights,
+        search_method,
+    )
+
+    # Custom CSS for search button
+    custom_css = """
+
+    """
+    st.markdown(custom_css, unsafe_allow_html=True)
+
+if __name__ == "__main__":
+    main()
diff --git a/bonus_phase/utils.py b/bonus_phase/utils.py
new file mode 100644
index 0000000..6aaa8b9
--- /dev/null
+++ b/bonus_phase/utils.py
@@ -0,0 +1,131 @@
+from typing import Dict, List
+from .core.search import SearchEngine
+from .core.spell_correction import SpellCorrection
+from .core.preprocess import Preprocessor
+from .core.snippet import Snippet
+from .core.indexer.indexes_enum import Indexes, Index_types
+import json
+
+movies_dataset = json.load(open("../Logic/tests/IMDB_crawled.json", "r"))  # TODO
+search_engine = SearchEngine()
+
+
+def correct_text(text: str, all_documents: List[str]) -> str:
+    """
+    Correct the given query text if it is misspelled, using Jaccard similarity.
+
+    Parameters
+    ----------
+    text: str
+        The query text
+    all_documents : list of str
+        The input documents.
+
+    Returns
+    -------
+    str
+        The corrected form of the given text
+    """
+    # TODO: You can add any preprocessing steps here, if needed!
+    text = Preprocessor([text]).preprocess()[0]
+    # TODO: uncomment for spell correction
+    # spell_correction_obj = SpellCorrection(all_documents)
+    # new_text = ""
+    # for word in text.split():
+    #     new_text += spell_correction_obj.spell_check(word) + " "
+    # text = new_text
+    return text
+
+
+def search(
+    query: str,
+    max_result_count: int,
+    method: str = "ltn-lnn",
+    weights: list = [0.3, 0.3, 0.4],
+    should_print=False,
+    preferred_genre: str = None,
+):
+    """
+    Finds documents relevant to the query.
+
+    Parameters
+    ----------
+    max_result_count: Return the top 'max_result_count' docs with the highest scores.
+        Notice that if max_result_count = -1, then you have to return all docs.
+
+    mode: 'detailed' for searching in title and text separately.
+        'overall' for all words, weighted by where the word appears.
+
+    where: when mode = 'detailed', whether to search the query in title or in text,
+        not both of them at the same time.
+
+    method: 'ltn.lnn' or 'ltc.lnc' or 'OkapiBM25'
+
+    preferred_genre: A list containing preference rates for each genre. If None, the preference rates are equal.
+
+    Returns
+    -------
+    list
+        Retrieved documents with snippet
+    """
+    dict_weights = {'stars': weights[0], 'genres': weights[1], 'summaries': weights[2]}  # TODO
+    return search_engine.search(
+        query, method, dict_weights, max_results=max_result_count, safe_ranking=True
+    )
+
+
+import os
+
+
+def get_movie_by_id(id: str, movies_dataset: List[Dict[str, str]]) -> Dict[str, str]:
+    """
+    Get a movie by its id
+
+    Parameters
+    ----------
+    id: str
+        The id of the movie
+
+    movies_dataset: List[Dict[str, str]]
+        The dataset of movies
+
+    Returns
+    -------
+    dict
+        The movie with the given id
+    """
+    # TODO: self-added code
+    for movie in movies_dataset:
+        if movie["id"] == id:
+            result = movie
+            break
+
+    # result = movies_dataset.get(
+    #     id,
+    #     {
+    #         "Title": "This is movie's title",
+    #         "Summary": "This is a summary",
+    #         "URL": "https://www.imdb.com/title/tt0111161/",
+    #         "Cast": ["Morgan Freeman", "Tim Robbins"],
+    #         "Genres": ["Drama", "Crime"],
+    #         "Image_URL": "https://m.media-amazon.com/images/M/MV5BNDE3ODcxYzMtY2YzZC00NmNlLWJiNDMtZDViZWM2MzIxZDYwXkEyXkFqcGdeQXVyNjAwNDUxODI@._V1_.jpg",
+    #     },
+    # )
+
+    import requests
+
+    url = "http://www.omdbapi.com/"
+    params = {
+        "i": result['id'],
+        "apikey": "6ee1c49f"
+    }
+
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        movie_data = response.json()
+        result["Image_URL"] = movie_data["Poster"]
+        result["score"] = movie_data["Ratings"][0]["Value"]
+
+    result["URL"] = (
+        f"https://www.imdb.com/title/{result['id']}"  # The URL pattern of IMDb movies
+    )
+    return result