-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
114 lines (98 loc) · 4.29 KB
/
app.py
File metadata and controls
114 lines (98 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os
# Import the data loading and processing function from analysis.py
# Assuming analysis.py is in the same directory
from analysis import load_and_process_data
# --- Streamlit Application ---
# Page chrome: the same title is used for the browser tab and the on-page
# heading, so it is defined once. The page config call is kept ahead of every
# other Streamlit call in the script.
_APP_TITLE = "CORD-19 Research Paper Analysis"

# Introductory blurb rendered as Markdown directly under the heading.
_APP_INTRO = """
This application provides an interactive analysis of the CORD-19 (COVID-19 Open Research Dataset) metadata.
Explore trends in publication years, top journals, and common keywords in paper titles.
"""

st.set_page_config(page_title=_APP_TITLE, layout="wide")
st.title(_APP_TITLE)
st.markdown(_APP_INTRO)
# Cached by Streamlit so the load is not repeated on every script rerun.
@st.cache_data
def get_data():
    """Return the dataset produced by ``load_and_process_data``.

    The result is whatever ``analysis.load_and_process_data`` returns; the
    rest of this script treats it as a pandas DataFrame (it reads ``.empty``
    and indexes columns by name).
    """
    loaded = load_and_process_data()
    return loaded
def _show_figure(fig):
    """Render *fig* on the page and release it afterwards, even on failure."""
    try:
        st.pyplot(fig)
    finally:
        # Close in all cases so reruns do not accumulate open figures.
        plt.close(fig)


def _render_count_bars(counts, *, title, ylabel, palette, empty_message):
    """Draw a horizontal bar chart of *counts*, or show *empty_message*.

    Args:
        counts: Series of counts indexed by category label (the output of
            ``value_counts()``).
        title: Axes title.
        ylabel: Label for the category axis.
        palette: Seaborn palette name.
        empty_message: Text shown with ``st.info`` when *counts* is empty.
    """
    if counts.empty:
        st.info(empty_message)
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    # NOTE(review): newer seaborn releases warn when `palette` is passed
    # without `hue`; left unchanged to stay compatible with whichever seaborn
    # version is installed - confirm before changing.
    sns.barplot(x=counts.values, y=counts.index, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Number of Papers')
    ax.set_ylabel(ylabel)
    _show_figure(fig)


def _render_title_wordcloud(titles):
    """Render a word cloud built from the non-null values of *titles*.

    Falls back to an informational message when there is no text to plot.
    """
    unavailable = "No titles available for word cloud generation with the selected filters."
    # Coerce to str so a stray non-string title cannot break the join.
    text = ' '.join(titles.dropna().astype(str)).strip()
    if not text:
        st.info(unavailable)
        return
    try:
        cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # Raised by WordCloud when no usable words remain in the text
        # (for example, when every word is filtered out as a stop word).
        st.info(unavailable)
        return
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Word Cloud of Paper Titles')
    _show_figure(fig)


df = get_data()

if df.empty:
    # Drop the cached empty result; otherwise "try again" would keep serving
    # the same empty frame from the cache instead of re-reading the data.
    get_data.clear()
    st.error("Could not load data. Please ensure 'metadata.csv' is in the project directory and try again.")
else:
    st.sidebar.header("Filter Options")

    # Year range: computed from valid years only, so a missing value cannot
    # turn the bounds into NaN.
    valid_years = df['year'].dropna()
    if valid_years.empty:
        st.error("The loaded data contains no valid publication years.")
        st.stop()
    min_year = int(valid_years.min())
    max_year = int(valid_years.max())
    if min_year < max_year:
        year_range = st.sidebar.slider(
            "Select Year Range",
            min_value=min_year,
            max_value=max_year,
            value=(min_year, max_year),
        )
    else:
        # Only one year is present: a range slider would have nothing to
        # select, so use that year directly.
        year_range = (min_year, max_year)
        st.sidebar.info(f"Only one publication year available: {min_year}")

    # Journal dropdown: drop missing journals first, because sorting a mix of
    # NaN and strings raises TypeError.
    all_journals = ["All"] + sorted(df['journal'].dropna().unique().tolist())
    selected_journal = st.sidebar.selectbox("Select Journal", all_journals)

    # Apply filters (both year bounds are inclusive).
    filtered_df = df[(df['year'] >= year_range[0]) & (df['year'] <= year_range[1])]
    if selected_journal != "All":
        filtered_df = filtered_df[filtered_df['journal'] == selected_journal]

    st.header("Analysis Results")

    if filtered_df.empty:
        st.warning("No data available for the selected filters.")
    else:
        # --- Visualizations ---
        st.subheader("Publications Over Time")
        papers_by_year = filtered_df['year'].value_counts().sort_index()
        fig1, ax1 = plt.subplots(figsize=(10, 5))
        sns.lineplot(x=papers_by_year.index, y=papers_by_year.values, ax=ax1)
        ax1.set_title('Number of Publications Over Time')
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Number of Papers')
        ax1.grid(True)
        _show_figure(fig1)

        st.subheader("Top Publishing Journals")
        top_journals = filtered_df['journal'].value_counts().head(10)
        _render_count_bars(
            top_journals,
            title='Top 10 Publishing Journals',
            ylabel='Journal',
            palette='viridis',
            empty_message="No journals found for the selected filters.",
        )

        st.subheader("Word Cloud of Paper Titles")
        _render_title_wordcloud(filtered_df['title'])

        st.subheader("Distribution of Paper Counts by Source")
        # Prefer the 'source_x' column when present; otherwise fall back to
        # counting by journal.
        source_col = 'source_x' if 'source_x' in filtered_df.columns else 'journal'
        paper_sources = filtered_df[source_col].value_counts().head(10)
        _render_count_bars(
            paper_sources,
            title=f'Top 10 Paper Sources ({source_col})',
            ylabel='Source',
            palette='plasma',
            empty_message="No sources found for the selected filters.",
        )

        st.subheader("Data Sample")
        st.write(filtered_df.head(10))