-
Notifications
You must be signed in to change notification settings - Fork 0
/
vtsoc.py
64 lines (55 loc) · 1.95 KB
/
vtsoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import html
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

from constants import COL_JOURNAL, COL_DEADLINE, COL_PUB_DATE, COL_TOPIC
# Base address of the vtsociety.org site.
URL_SOC = "https://vtsociety.org"

# Journal label -> page listing that journal's calls for papers.
# Insertion order is the order in which the journals are scraped.
JOURNALS = {
    "OJVT": "https://vtsociety.org/publication/ieee-ojvt/special-issues",
    "TVT": "https://vtsociety.org/publication/transactions-vehicular-technology/call-papers",
    "VTM": "https://vtsociety.org/publication/vtmagazine#documents",
}

# Disabled alternative pattern (submission and publication dates):
# RE_DATE = r"Manuscript submission: (.+? \d{1,2}, \d{4})(?:.*)Final publication: (.+)"

# Captures the "<day> <month word> <4-digit year>" date that directly follows
# the literal "Deadline:" (no whitespace after the colon).
RE_DATE = r"(?:.*)Deadline:(\d{1,2} \w+ \d{4})(?:.*)"
def get_all_cfp():
    """Gather the call-for-papers tables of every configured journal.

    Each ``JOURNALS`` entry is scraped with ``parse_journal_cfp``; the
    per-journal frames are then stacked into one frame with a fresh,
    continuous index.

    Returns:
        The concatenated DataFrame.
    """
    frames = [parse_journal_cfp(link, name) for name, link in JOURNALS.items()]
    # NOTE: the translate_data_formats post-processing step is disabled here.
    return pd.concat(frames, ignore_index=True)
def parse_journal_cfp(
    url: str, journal: str, timeout: float = 30.0
) -> pd.DataFrame:
    """Scrape the call-for-papers entries listed on one journal page.

    The page at ``url`` is downloaded and the ``<article>`` elements inside
    its first ``<main>`` element are inspected in order.  An article yields a
    row when its ``<h3>`` heading contains a link (the topic) and its text
    matches ``RE_DATE`` (the submission deadline).  Scanning stops at the
    first article that has no linked ``<h3>`` heading.

    Args:
        url: Address of the journal's call-for-papers page.
        journal: Journal label shown in the journal column.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A DataFrame with the columns ``COL_TOPIC``, ``COL_DEADLINE``,
        ``COL_PUB_DATE`` and ``COL_JOURNAL``.  It has no rows when nothing on
        the page qualified.  Topic and journal cells are HTML snippets; the
        publication date is always ``"Unknown"``.

    Raises:
        requests.RequestException: If the download fails or times out.
        IndexError: If the page contains no ``<main>`` element.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, "html.parser")
    posts = soup("main")[0].find_all("article")

    # Identical for every row of this journal, so build them once.
    pub_date = "Unknown"
    journal_url = f'<a href="{url}">{journal}</a>'

    rows = []
    for post in posts:
        heading = post.find("h3")
        topic_cell = heading.find("a") if heading is not None else None
        if topic_cell is None:
            # An article without a linked <h3> heading ends the scan; this
            # also covers an <h3> that has no <a>, which would otherwise
            # fail on the None below.
            break

        # NFKD folds compatibility characters (e.g. non-breaking spaces)
        # into their plain equivalents.
        title = unicodedata.normalize("NFKD", topic_cell.get_text(strip=True))

        # Scraped text is untrusted: escape it before embedding it in HTML.
        topic = html.escape(title)
        try:
            href = topic_cell["href"]
        except KeyError:
            # Anchor without a target: keep the topic as plain text.
            pass
        else:
            topic = f'<a href="{html.escape(str(href))}">{topic}</a>'

        # Articles that do not state a deadline are skipped.
        found = re.match(RE_DATE, post.get_text(strip=True))
        if found is None:
            continue

        rows.append([topic, found.group(1), pub_date, journal_url])

    return pd.DataFrame(
        data=rows,
        columns=[COL_TOPIC, COL_DEADLINE, COL_PUB_DATE, COL_JOURNAL],
    )
if __name__ == "__main__":
    # Script entry point: scrape every configured journal and dump the table.
    print(get_all_cfp())