-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
045631e
commit 45c3298
Showing
5 changed files
with
263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[metadata] | ||
description-file = README.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
"""Packaging script for the twitch-listener distribution."""
from distutils.core import setup

# NOTE(review): ``install_requires`` is a setuptools option; confirm the build
# is run through setuptools (or switch the import above) so the dependency
# list below is actually honoured.
setup(
    name='twitch-listener',
    packages=['twitch-listener'],
    version='1.0.0',
    license='MIT',
    description='Tools to scrape Twitch chat data',
    author='Oliver Lloyd',
    author_email='ollielloyd96@outlook.com',
    url='https://github.com/lloyd334/twitch-listener',
    # NOTE(review): this is still the template placeholder URL; point it at
    # the real release archive before publishing.
    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
    keywords=['Twitch', 'chatbot', 'scraper'],
    install_requires=[
        'validators',
        'beautifulsoup4',
        # Imported directly by the package modules (chat-log parsing and the
        # stream liveness check), so they must be installed with it.
        'pandas',
        'requests',
    ],
    classifiers=[
        # One of "3 - Alpha", "4 - Beta" or "5 - Production/Stable".
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
    ],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
import pandas as pd | ||
from socket import socket | ||
from time import time, sleep | ||
|
||
class twitch(socket):
    """Scrape chat from Twitch channels over IRC and convert the logs to CSV.

    One plain ``socket()`` is opened per joined channel (see
    ``_join_channels``); raw responses are written to ``<channel>.log`` and
    can later be turned into ``<channel>.csv`` with ``parse_logs``.

    NOTE(review): the class derives from ``socket`` but never initialises or
    uses itself as one. The base class is left unchanged for backward
    compatibility.
    """

    def __init__(self, nickname, oauth, client_id):
        """Store the credentials and pre-build the IRC login lines.

        Parameters:
            nickname (string)
                - Twitch username sent in the NICK line.
            oauth (string)
                - OAuth token, with or without the leading 'oauth:'.
            client_id (string)
                - Client id passed to utils.is_live.
        """

        self.nickname = nickname
        self.client_id = client_id
        if oauth.startswith('oauth:'):
            self.oauth = oauth
        else:
            self.oauth = 'oauth:' + oauth

        # IRC parameters
        self._server = "irc.chat.twitch.tv"
        self._port = 6667
        # Use the normalised token: the raw argument may lack the 'oauth:'
        # prefix, which would be sent to the server unprefixed.
        self._passString = "PASS " + self.oauth + "\n"
        self._nameString = "NICK " + nickname + "\n"

    def _join_channels(self, channels):
        """Open one IRC connection per live channel and attach a file logger.

        Resets ``self._sockets``, ``self.joined`` and ``self._loggers``, then
        for every channel reported live: connects, authenticates, sends JOIN
        for the lower-cased channel name and logs to ``<channel>.log``.
        Channels that are not live are reported on stdout and skipped.
        """

        self._sockets = {}
        self.joined = []
        self._loggers = {}

        # NOTE(review): `utils` is not imported by the import lines visible
        # above this class; confirm it is in scope at runtime.
        for channel in channels:
            if not utils.is_live(channel, self.client_id):
                print(channel + " is not live right now.")
                continue

            # Establish socket connection
            connection = socket()
            self._sockets[channel] = connection
            connection.connect((self._server, self._port))
            connection.send(self._passString.encode('utf-8'))
            connection.send(self._nameString.encode('utf-8'))

            joinString = "JOIN #" + channel.lower() + "\n"
            connection.send(joinString.encode('utf-8'))
            self._loggers[channel] = utils.setup_loggers(channel, channel + '.log')

            self.joined.append(channel)

    def listen(self, channels, duration = 0, debug = True):

        """
        Method for scraping chat data from Twitch channels.
        Parameters:
            channels (string or list)
                - Channel(s) to connect to.
            duration (int)
                - Length of time to listen for, in seconds.
                - With the default of 0 nothing is collected.
            debug (bool)
                - Debugging feature, will likely be removed in later version.
        Notes:
            - A channel found offline is not polled again during this call,
              and collection stops early once no joined channel is left.
            - Sockets are closed even if collection raises.
        """

        if isinstance(channels, str):
            channels = [channels]
        self._join_channels(channels)
        startTime = time()
        min_interval = 60 / 800  # Minimum seconds between reads (rate limit)

        # Kept across loop passes so an offline channel is not re-checked
        # with utils.is_live on every pass.
        offline = []

        try:
            # Collect data while duration not exceeded and channels are live
            while (time() - startTime) < duration:
                now = time()  # Track loop time for adaptive rate limiting
                active = [channel for channel in self.joined
                          if channel not in offline]
                if not active:
                    break

                for channel in active:
                    if not utils.is_live(channel, self.client_id):
                        offline.append(channel)
                        continue

                    response = self._sockets[channel].recv(16384)
                    if b"PING :tmi.twitch.tv\r\n" in response:
                        # Answer the keep-alive
                        self._sockets[channel].send("PONG :tmi.twitch.tv\r\n".encode("utf-8"))
                        if debug:
                            print("\n\n!!Look, a ping: \n")
                            print(response)
                            print("\n\n")
                    else:
                        self._loggers[channel].info(response)
                        if debug:
                            print(response)

                    elapsed = time() - now
                    if elapsed < min_interval:
                        sleep(min_interval - elapsed)  # Rate limit

            if debug:
                print("Collected for " + str(time()-startTime) + " seconds")
        finally:
            # Close sockets once not collecting data
            for channel in self.joined:
                self._sockets[channel].close()

    def _split_line(self, line, firstLine = False):
        """Split one raw log line holding several IRC messages.

        The first 28 characters of the line are re-attached to each split
        message that lost them. With ``firstLine`` set, everything up to and
        including 'End of /NAMES list' is dropped first, so every returned
        message gets those 28 characters prepended. Only pieces containing
        'PRIVMSG' are kept, plus the first piece unconditionally.
        """

        prefix = line[:28]
        if firstLine:
            line = line.split('End of /NAMES list\\r\\n')[1]
        splits = [message for ind, message in enumerate(line.split("\\r\\n"))
                  if 'PRIVMSG' in message or ind == 0]

        if firstLine:
            return [prefix + message for message in splits]
        # Without firstLine the leading piece still carries its own prefix.
        return splits[:1] + [prefix + message for message in splits[1:]]

    def parse_logs(self, timestamp = True, channels = None):

        """
        Method for converting raw data from text logs into .CSV format.
        Parameters:
            timestamp (boolean, optional)
                - Whether or not to include the timestamp of chat messages.
                - Note: timestamps represent when message
                    was retrieved, not sent
            channels (list, optional)
                - List of channel usernames for whom the text logs
                    will be parsed into csv format.
                - If none are specified, the channels that are
                    currently joined will be parsed
        """

        # Check if specific list of channels is given
        if not channels:
            try:
                channels = self.joined
            except AttributeError:
                # Nothing has been joined yet on this instance.
                print("Please either connect to channels, \
                      or specify a list of log files to parse.")
                return

        for channel in channels:
            filename = channel + ".log"
            with open(filename) as f:
                # Drop duplicate lines, keeping first-seen order
                lines = list(dict.fromkeys(f))

            # Separate the raw strings into separate messages
            split_messages = []
            for line in lines:
                count = line.count('.tmi.twitch.tv PRIVMSG #')

                if 'Your host is tmi.twitch.tv' in line:
                    # Connection banner; may have chat appended after it
                    if 'PRIVMSG' in line:
                        split_messages.extend(
                            self._split_line(line, firstLine = True))
                elif count == 0:
                    continue
                elif count == 1:
                    if line.endswith('\\r\\n\'\n'):
                        # Strip the 6 trailing characters matched above
                        split_messages.append(line[:-6])
                    else:
                        split_messages.append(line)
                else:
                    split_messages.extend(self._split_line(line))

            # Parse username, message text and (optional) datetime
            # JOIN is sent with the lower-cased channel name, so search for
            # that form rather than the caller's casing.
            marker = "PRIVMSG #" + channel.lower()
            data = []
            for message in split_messages:
                row = {}

                # Parse message text
                slice_ = message[message.find(marker):]
                row['text'] = slice_[slice_.find(":") + 1:]

                # Parse username
                b = message.find("b")
                exclam = message.find("!")
                row['username'] = message[b:exclam][3:]

                # Parse timestamp
                if timestamp:
                    row['timestamp'] = message[:23]

                # Store observations
                data.append(row)

            # Write data to file
            pd.DataFrame(data).to_csv(channel + ".csv", index = False)
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import TwitchListener | ||
import utils | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import logging | ||
import requests | ||
import json | ||
|
||
def setup_loggers(name, log_file, level=logging.INFO):
    """Return the logger called *name*, writing to *log_file*.

    Records are formatted as '<asctime> — <message>'. ``logging.getLogger``
    returns the same logger object for the same name, so a repeated call
    with the same name and file reuses the file handler already attached
    instead of stacking another one (which would write every record twice).

    Parameters:
        name: logger name.
        log_file: path of the file to append records to.
        level: logging level set on the logger (default ``logging.INFO``).

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: only needed to normalise the handler path

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(os.fspath(log_file))
    for existing in logger.handlers:
        if isinstance(existing, logging.FileHandler) \
                and existing.baseFilename == target:
            return logger

    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter('%(asctime)s — %(message)s'))
    logger.addHandler(handler)

    return logger
|
||
def is_live(streamer_name, client_id):
    """Report whether *streamer_name* is currently streaming.

    Queries the Twitch ``kraken/streams/<name>`` endpoint and checks the
    ``stream`` entry of the JSON reply.

    Parameters:
        streamer_name: channel name appended to the endpoint URL.
        client_id: sent as the ``client_id`` query parameter.

    Returns:
        True when the reply has a non-null ``stream`` entry. False when the
        entry is null or absent (e.g. an error payload).

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not answer within the timeout.
        ValueError: when the reply body is not valid JSON.
    """

    twitch_api_stream_url = "https://api.twitch.tv/kraken/streams/" \
        + streamer_name

    # Bound the wait: without a timeout a stalled connection would block the
    # caller indefinitely. Passing the query via ``params`` lets requests
    # handle the URL encoding.
    streamer_html = requests.get(
        twitch_api_stream_url,
        params={"client_id": client_id},
        timeout=10,
    )

    streamer = json.loads(streamer_html.content)

    # ``.get`` so a payload without a "stream" key reads as "not live"
    # instead of raising KeyError.
    return streamer.get("stream") is not None