Skip to content

Commit

Permalink
Merge pull request #19 from claromes/global-lang
Browse files Browse the repository at this point in the history
Set Cookies and Reorganize output file name
  • Loading branch information
claromes authored Jan 10, 2024
2 parents 0c80fa1 + c7c317c commit c929028
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 117 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![PyPI](https://img.shields.io/pypi/v/volleystats)](https://pypi.org/project/volleystats/) [![License](https://img.shields.io/github/license/claromes/volleystats)](https://github.com/claromes/volleystats/blob/main/LICENSE.md)

CLI tool to scrape volleyball statistics from Data Project Web Competition websites (WCM)
Command-line tool to scrape volleyball statistics from Data Project Web Competition websites (WCM)

**This tool is not affiliated with Genius Sports Company**

Expand Down Expand Up @@ -83,12 +83,6 @@ volleystats [--help] --fed FED (--match MATCH | --comp COMP) [--log]
- `fcv`: Cordoba Volleyball Federation
- `fpdv`: Peruvian Volleyball Federation

## Available Page Locales

- pt-BR
- en-GB
- cs-CZ

## Docs

- [Usage examples](https://github.com/claromes/volleystats/blob/main/docs/EXAMPLES.md)
Expand Down
6 changes: 5 additions & 1 deletion volleystats/spiders/competition.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def __init__(self, fed_acronym='', competition_id='', **kwargs):

super().__init__(**kwargs)

def start_requests(self):
    """Yield the spider's single initial request with a language cookie attached.

    The cookie is named ``CompetitionLangCode<fed_acronym>`` and carries the
    value ``en-GB``; the response is handed to ``self.parse``.
    NOTE(review): presumably this makes the site render its English locale
    regardless of the federation's default -- confirm against the site.
    """
    # Only the first entry of start_urls is requested.
    first_url = self.start_urls[0]
    lang_cookie_name = f'CompetitionLangCode{self.fed_acronym}'

    yield scrapy.Request(
        first_url,
        cookies={lang_cookie_name: 'en-GB'},
        callback=self.parse,
    )

def parse(self, response):
competition_items = []

Expand Down Expand Up @@ -72,7 +76,7 @@ def parse(self, response):

def closed(spider, reason):
src = 'data/competition_matches.csv'
dst = f'data/{spider.competition_id}-{spider.fed_acronym}-{spider.first_item_date}-{spider.last_item_date}-competition_matches.csv'
dst = f'data/{spider.fed_acronym}-{spider.competition_id}-{spider.first_item_date}-{spider.last_item_date}-competition_matches.csv'

try:
os.rename(src, dst)
Expand Down
32 changes: 14 additions & 18 deletions volleystats/spiders/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,23 @@ class HomeStatsSpider(scrapy.Spider):

def __init__(self, fed_acronym='', match_id='', **kwargs):
    """Configure the spider for one match of one federation.

    Args:
        fed_acronym: Federation acronym; used as the host prefix of the
            statistics URL and stored on the instance.
        match_id: Value of the ``mID`` query parameter identifying the match;
            stored on the instance.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    self.start_urls = [f'https://{fed_acronym}-web.dataproject.com/MatchStatistics.aspx?mID={match_id}']
    self.fed_acronym = fed_acronym
    self.match_id = match_id
    # These used to be bare locals that were discarded when __init__ returned.
    # `closed` reads spider.match_date and spider.home_team, so give the
    # instance empty defaults that exist even if parsing never ran.
    self.match_date = ''
    self.home_team = ''

    super().__init__(**kwargs)

def start_requests(self):
    """Yield the one initial request for the match page, with a language cookie.

    The cookie ``CompetitionLangCode<fed_acronym>`` is sent with the value
    ``en-GB`` and the response is routed to ``self.parse``.
    NOTE(review): presumably this pins the page to its English locale so the
    en-GB date format is what gets parsed -- confirm against the site.
    """
    language_cookies = {}
    language_cookies[f'CompetitionLangCode{self.fed_acronym}'] = 'en-GB'

    # Only the first entry of start_urls is requested.
    request = scrapy.Request(
        self.start_urls[0],
        cookies=language_cookies,
        callback=self.parse,
    )
    yield request

def parse(self, response):
match_date_text = response.xpath("normalize-space(//span[@id='Content_Main_LB_DateTime']/text())").get()

ptBR = response.xpath("//*[contains(@class, 'RCB_Culture_pt-BR')]/span/input/@value").get()
enGB = response.xpath("//*[contains(@class, 'RCB_Culture_en-GB')]/span/input/@value").get()
csCZ = response.xpath("//*[contains(@class, 'RCB_Culture_cs-CZ')]/span/input/@value").get()

if ptBR == 'PT':
match_date = parse_ptbr_date(match_date_text)
elif enGB == 'EN':
if enGB == 'EN':
match_date = parse_engb_date(match_date_text)
elif csCZ == 'CZ':
match_date = parse_cscz_date(match_date_text)

home_team_string = response.xpath("normalize-space(//span[@id='Content_Main_LBL_HomeTeam']/text())").get().replace(' ', '-').lower()
home_team = re.sub('[^A-Za-z0-9]+', '-', home_team_string)
Expand Down Expand Up @@ -58,7 +56,7 @@ def parse(self, response):

def closed(spider, reason):
src = 'data/home_stats.csv'
dst = f'data/{spider.match_id}-{spider.match_date}-home-{spider.home_team}.csv'
dst = f'data/{spider.fed_acronym}-{spider.match_id}-{spider.match_date}-home-{spider.home_team}.csv'

try:
os.rename(src, dst)
Expand All @@ -72,25 +70,23 @@ class GuestStatsSpider(scrapy.Spider):

def __init__(self, fed_acronym='', match_id='', **kwargs):
    """Configure the spider for one match of one federation.

    Args:
        fed_acronym: Federation acronym; used as the host prefix of the
            statistics URL and stored on the instance.
        match_id: Value of the ``mID`` query parameter identifying the match;
            stored on the instance.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    self.start_urls = [f'https://{fed_acronym}-web.dataproject.com/MatchStatistics.aspx?mID={match_id}']
    self.fed_acronym = fed_acronym
    self.match_id = match_id
    # These used to be bare locals that were discarded when __init__ returned.
    # `closed` reads spider.match_date and spider.guest_team, so give the
    # instance empty defaults that exist even if parsing never ran.
    self.match_date = ''
    self.guest_team = ''

    super().__init__(**kwargs)

def start_requests(self):
    """Yield the one initial request for the match page, with a language cookie.

    Sends the cookie ``CompetitionLangCode<fed_acronym>`` set to ``en-GB`` and
    registers ``self.parse`` as the response callback.
    NOTE(review): presumably this pins the page to its English locale so the
    en-GB date format is what gets parsed -- confirm against the site.
    """
    cookie_key = f'CompetitionLangCode{self.fed_acronym}'
    # Only the first entry of start_urls is requested.
    target_url = self.start_urls[0]

    yield scrapy.Request(target_url, cookies={cookie_key: 'en-GB'}, callback=self.parse)

def parse(self, response):
match_date_text = response.xpath("normalize-space(//span[@id='Content_Main_LB_DateTime']/text())").get()

ptBR = response.xpath("//*[contains(@class, 'RCB_Culture_pt-BR')]/span/input/@value").get()
enGB = response.xpath("//*[contains(@class, 'RCB_Culture_en-GB')]/span/input/@value").get()
csCZ = response.xpath("//*[contains(@class, 'RCB_Culture_cs-CZ')]/span/input/@value").get()

if ptBR == 'PT':
match_date = parse_ptbr_date(match_date_text)
elif enGB == 'EN':
if enGB == 'EN':
match_date = parse_engb_date(match_date_text)
elif csCZ == 'CZ':
match_date = parse_cscz_date(match_date_text)

guest_team_string = response.xpath("normalize-space(//span[@id='Content_Main_LBL_GuestTeam']/text())").get().replace(' ', '-').lower()
guest_team = re.sub('[^A-Za-z0-9]+', '-', guest_team_string)
Expand Down Expand Up @@ -120,7 +116,7 @@ def parse(self, response):

def closed(spider, reason):
src = 'data/guest_stats.csv'
dst = f'data/{spider.match_id}-{spider.match_date}-guest-{spider.guest_team}.csv'
dst = f'data/{spider.fed_acronym}-{spider.match_id}-{spider.match_date}-guest-{spider.guest_team}.csv'

try:
os.rename(src, dst)
Expand Down
98 changes: 8 additions & 90 deletions volleystats/utils.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,16 @@
import locale
import re

from datetime import datetime

# '28/10/2022 - 19:30' to 2022-10-28
# '28/10/2022 - 19:30' or '28.10.2022 - 19:30' to 2022-10-28
def parse_short_date(short_date_string):
short_date_obj = datetime.strptime(short_date_string, "%d/%m/%Y - %H:%M")
parsed_short_date = short_date_obj.strftime("%Y-%m-%d")

return parsed_short_date

# pt_BR: 'sábado, 5 de novembro de 2022 - 21:30' to 2022-11-05
def parse_ptbr_date(date_string):
str_1 = date_string.replace(' de ', '-')
str_2 = str_1.replace(' ', '')
str_3 = str_2.split(',', 1)[1]
str_4 = str_3.rsplit('-', 1)[0]
if '/' in short_date_string:
date_format = '%d/%m/%Y'
elif '.' in short_date_string:
date_format = '%d.%m.%Y'

date_re_1 = re.search(r'-(\w+)-', str_4)
date_re_2 = date_re_1.group(1)
short_date_obj = datetime.strptime(short_date_string, f'{date_format} - %H:%M')
parsed_short_date = short_date_obj.strftime('%Y-%m-%d')

if date_re_2 == 'janeiro':
date_re_en = 'january'
elif date_re_2 == 'fevereiro':
date_re_en = 'february'
elif date_re_2 == 'março':
date_re_en = 'march'
elif date_re_2 == 'abril':
date_re_en = 'april'
elif date_re_2 == 'maio':
date_re_en = 'may'
elif date_re_2 == 'junho':
date_re_en = 'june'
elif date_re_2 == 'julho':
date_re_en = 'july'
elif date_re_2 == 'agosto':
date_re_en = 'august'
elif date_re_2 == 'setembro':
date_re_en = 'september'
elif date_re_2 == 'outubro':
date_re_en = 'october'
elif date_re_2 == 'novembro':
date_re_en = 'november'
elif date_re_2 == 'dezembro':
date_re_en = 'december'

var_date_re_2 = f'{re.escape(date_re_2)}-'
parsed_date = re.sub(var_date_re_2, f'{date_re_en}-', str_4)

parsed_ptbr_date = datetime.strptime(parsed_date, '%d-%B-%Y').date()

return parsed_ptbr_date
return parsed_short_date

# en-GB: '05 November 2022 - 21:30' to 2022-11-05
def parse_engb_date(date_string):
Expand All @@ -60,45 +20,3 @@ def parse_engb_date(date_string):
parsed_engb_date = datetime.strptime(str_2, '%d-%B-%Y').date()

return parsed_engb_date

# cs-CZ: 'čtvrtek 26. října 2023 - 14:00' to 2023-10-26
def parse_cscz_date(date_string):
    """Convert a cs-CZ long date to a ``datetime.date``.

    Expected input looks like ``'čtvrtek 26. října 2023 - 14:00'``:
    weekday, day with an ordinal dot, month name in the genitive case, year,
    and an optional `` - HH:MM`` suffix. The weekday and the time are ignored.

    Args:
        date_string: The date text as shown on the page.

    Returns:
        The calendar date as a ``datetime.date`` (e.g. ``2023-10-26``).

    Raises:
        ValueError: If the text does not have the weekday/day/month/year
            shape, the month name is not a known Czech month, or the day/year
            are not valid numbers for a date.
    """
    month_numbers = {
        'ledna': 1,
        'února': 2,
        'března': 3,
        'dubna': 4,
        'května': 5,
        'června': 6,
        'července': 7,
        'srpna': 8,
        'září': 9,
        'října': 10,
        'listopadu': 11,
        'prosince': 12,
    }

    # Drop the trailing ' - HH:MM' and the ordinal dot after the day, then
    # split into the four remaining words.
    date_part = date_string.rsplit(' - ', 1)[0].replace('.', ' ')
    tokens = date_part.split()
    if len(tokens) != 4:
        raise ValueError(f'Unrecognised cs-CZ date: {date_string!r}')

    _weekday, day, month_name, year = tokens

    # Look the month up directly instead of translating it to English and
    # re-parsing with %B, which depends on the process locale and left the
    # translated name unbound for unknown months.
    month = month_numbers.get(month_name.lower())
    if month is None:
        raise ValueError(f'Unknown cs-CZ month name {month_name!r} in {date_string!r}')

    return datetime(int(year), month, int(day)).date()
2 changes: 1 addition & 1 deletion volleystats/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.1'
__version__ = '0.6'

0 comments on commit c929028

Please sign in to comment.