Frameworks_Assignment/app.py at master · mokwathedeveloper/Frameworks_Assignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

# Import the data loading and processing function from analysis.py
# Assuming analysis.py is in the same directory
from analysis import load_and_process_data

# --- Streamlit Application ---
# The browser-tab title and the on-page heading share the same text.
_APP_TITLE = "CORD-19 Research Paper Analysis"

# Wide layout so the charts rendered further down can use the full page width.
st.set_page_config(page_title=_APP_TITLE, layout="wide")

# Page heading followed by a short introduction to what the app shows.
st.title(_APP_TITLE)
st.markdown(
    """
This application provides an interactive analysis of the CORD-19 (COVID-19 Open Research Dataset) metadata.
Explore trends in publication years, top journals, and common keywords in paper titles.
"""
)

@st.cache_data  # Reuse the loaded result so Streamlit reruns do not reload it
def get_data():
    """Return the dataset produced by ``load_and_process_data``.

    This is a thin cached wrapper: the call is delegated unchanged to
    ``load_and_process_data`` (imported from ``analysis``) and its result is
    returned as-is. The rest of this script treats the result as a pandas
    DataFrame (it reads ``.empty`` and indexes columns by name).
    """
    loaded = load_and_process_data()
    return loaded

# Load the dataset (cached by get_data) for this script run; everything below
# renders from it.
df = get_data()

if df.empty:
    st.error("Could not load data. Please ensure 'metadata.csv' is in the project directory and try again.")
else:
    st.sidebar.header("Filter Options")

    # Year Range Slider
    # df is known to be non-empty in this branch, so the bounds come straight
    # from the data and no empty-frame fallback is needed.
    min_year = int(df['year'].min())
    max_year = int(df['year'].max())
    if min_year < max_year:
        year_range = st.sidebar.slider(
            "Select Year Range",
            min_value=min_year,
            max_value=max_year,
            value=(min_year, max_year)
        )
    else:
        # Only one distinct year in the data: a range slider needs
        # min_value < max_value, so skip the widget and use the fixed range.
        st.sidebar.write(f"Year: {min_year}")
        year_range = (min_year, max_year)

    # Journal Dropdown
    # dropna(): a missing journal (NaN, a float) cannot be ordered against
    # strings by sorted() and would raise TypeError.
    all_journals = ["All"] + sorted(df['journal'].dropna().unique().tolist())
    selected_journal = st.sidebar.selectbox("Select Journal", all_journals)

    # Apply filters (both year bounds are inclusive)
    filtered_df = df[(df['year'] >= year_range[0]) & (df['year'] <= year_range[1])]
    if selected_journal != "All":
        filtered_df = filtered_df[filtered_df['journal'] == selected_journal]

    st.header("Analysis Results")

    if filtered_df.empty:
        st.warning("No data available for the selected filters.")
    else:
        # --- Visualizations ---
        # Each figure is closed right after rendering so figures do not
        # accumulate in matplotlib across Streamlit reruns.

        st.subheader("Publications Over Time")
        papers_by_year = filtered_df['year'].value_counts().sort_index()
        fig1, ax1 = plt.subplots(figsize=(10, 5))
        # marker='o' keeps a single-year selection visible; a one-point line
        # without markers draws nothing.
        sns.lineplot(x=papers_by_year.index, y=papers_by_year.values, marker='o', ax=ax1)
        ax1.set_title('Number of Publications Over Time')
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Number of Papers')
        ax1.grid(True)
        st.pyplot(fig1)
        plt.close(fig1)

        st.subheader("Top Publishing Journals")
        top_journals = filtered_df['journal'].value_counts().head(10)
        if not top_journals.empty:
            fig2, ax2 = plt.subplots(figsize=(10, 6))
            # NOTE(review): seaborn >= 0.13 warns that `palette` without `hue`
            # is deprecated; left as-is because the replacement arguments are
            # not accepted by older seaborn versions. Confirm the pinned version.
            sns.barplot(x=top_journals.values, y=top_journals.index, palette='viridis', ax=ax2)
            ax2.set_title('Top 10 Publishing Journals')
            ax2.set_xlabel('Number of Papers')
            ax2.set_ylabel('Journal')
            st.pyplot(fig2)
            plt.close(fig2)
        else:
            st.info("No journals found for the selected filters.")

        st.subheader("Word Cloud of Paper Titles")
        # astype(str): str.join raises TypeError on any non-string title.
        all_titles = ' '.join(filtered_df['title'].dropna().astype(str))
        wordcloud = None
        if all_titles.strip():
            try:
                wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)
            except ValueError:
                # WordCloud raises ValueError when the text yields no usable
                # words (e.g. only stop words); fall through to the info message.
                wordcloud = None
        if wordcloud is not None:
            fig3, ax3 = plt.subplots(figsize=(10, 5))
            ax3.imshow(wordcloud, interpolation='bilinear')
            ax3.axis('off')
            ax3.set_title('Word Cloud of Paper Titles')
            st.pyplot(fig3)
            plt.close(fig3)
        else:
            st.info("No titles available for word cloud generation with the selected filters.")

        st.subheader("Distribution of Paper Counts by Source")
        # Prefer the 'source_x' column when the frame has it; otherwise reuse
        # the journal counts so the section still renders.
        if 'source_x' in filtered_df.columns:
            paper_sources = filtered_df['source_x'].value_counts().head(10)
            source_col = 'source_x'
        else:
            paper_sources = filtered_df['journal'].value_counts().head(10)  # Fallback
            source_col = 'journal'

        if not paper_sources.empty:
            fig4, ax4 = plt.subplots(figsize=(10, 6))
            sns.barplot(x=paper_sources.values, y=paper_sources.index, palette='plasma', ax=ax4)
            ax4.set_title(f'Top 10 Paper Sources ({source_col})')
            ax4.set_xlabel('Number of Papers')
            ax4.set_ylabel('Source')
            st.pyplot(fig4)
            plt.close(fig4)
        else:
            st.info("No sources found for the selected filters.")

        # First rows of the filtered frame, for a quick look at the raw data.
        st.subheader("Data Sample")
        st.write(filtered_df.head(10))