Skip to content

Commit

Permalink
pip-ready
Browse files Browse the repository at this point in the history
  • Loading branch information
oliver-lloyd committed Jun 12, 2019
1 parent 045631e commit 45c3298
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 0 deletions.
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
28 changes: 28 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Packaging script for the twitch_listener package.
try:
    # setuptools understands `install_requires`; plain distutils ignores it
    # (it only warns about an unknown option), so dependencies would never
    # be installed alongside the package.
    from setuptools import setup
except ImportError:  # Fall back to the original importer when setuptools is absent.
    from distutils.core import setup

setup(
    # Distribution name as published (may contain a hyphen).
    name='twitch-listener',
    # Importable package directory; it is `twitch_listener` (underscore) on disk.
    packages=['twitch_listener'],
    version='1.0.0',
    license='MIT',
    description='Tools to scrape Twitch chat data',
    author='Oliver Lloyd',
    author_email='ollielloyd96@outlook.com',
    url='https://github.com/lloyd334/twitch-listener',
    # NOTE(review): this is still the template placeholder and does not point
    # at this project; replace it with the real release archive URL.
    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
    keywords=['Twitch', 'chatbot', 'scraper'],
    install_requires=[
        'validators',
        'beautifulsoup4',
        # Imported by twitch_listener/TwitchListener.py and utils.py.
        'pandas',
        'requests',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
    ],
)
202 changes: 202 additions & 0 deletions twitch_listener/TwitchListener.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import pandas as pd
from socket import socket
from time import time, sleep

class twitch(socket):
    """Scrape chat from Twitch channels over IRC and convert the logs to CSV.

    One plain socket and one file logger are created per joined channel;
    raw responses are written to '<channel>.log' and can later be turned
    into '<channel>.csv' with parse_logs().
    """

    def __init__(self, nickname, oauth, client_id):
        """
        Store the credentials and pre-build the IRC login commands.
        Parameters:
            nickname (string)
                - Name sent in the IRC NICK command.
            oauth (string)
                - OAuth token, with or without the 'oauth:' prefix.
            client_id (string)
                - Client id handed to utils.is_live.
        """
        # NOTE: socket.__init__ is intentionally not called (as before), so
        # no unused file descriptor is opened for the instance itself; the
        # real connections are the per-channel sockets in _join_channels.
        self.nickname = nickname
        self.client_id = client_id
        self.oauth = oauth if oauth.startswith('oauth:') else 'oauth:' + oauth

        # IRC parameters
        self._server = "irc.chat.twitch.tv"
        self._port = 6667
        # Build PASS from the normalised token; the raw argument may lack
        # the 'oauth:' prefix that was just added above.
        self._passString = f"PASS {self.oauth}\n"
        self._nameString = f"NICK {nickname}\n"

    def _join_channels(self, channels):
        """
        Connect to every channel that is currently live.
        Parameters:
            channels (list)
                - Channel names to try to join.
        Populates self._sockets, self._loggers and self.joined; channels
        that are not live are reported with a message and skipped.
        """
        # Local import: the module never imported `utils` at file level.
        from . import utils

        self._sockets = {}
        self.joined = []
        self._loggers = {}

        # Establish socket connections
        for channel in channels:
            if not utils.is_live(channel, self.client_id):
                print(channel + " is not live right now.")
                continue

            connection = socket()
            try:
                connection.connect((self._server, self._port))
                # sendall() keeps sending until the whole command is out.
                connection.sendall(self._passString.encode('utf-8'))
                connection.sendall(self._nameString.encode('utf-8'))
                joinString = f"JOIN #{channel.lower()}\n"
                connection.sendall(joinString.encode('utf-8'))
            except OSError:
                # Do not leak the descriptor when the handshake fails.
                connection.close()
                raise

            self._sockets[channel] = connection
            self._loggers[channel] = utils.setup_loggers(channel, channel + '.log')
            self.joined.append(channel)

    def listen(self, channels, duration = 0, debug = True):
        """
        Method for scraping chat data from Twitch channels.
        Parameters:
            channels (string or list)
                - Channel(s) to connect to.
            duration (int)
                - Length of time to listen for, in seconds. With the
                  default of 0 the channels are joined and closed again
                  without collecting anything.
            debug (bool)
                - Debugging feature, will likely be removed in later version.
        """
        # Local import: the module never imported `utils` at file level.
        from . import utils

        if isinstance(channels, str):
            channels = [channels]
        self._join_channels(channels)
        startTime = time()
        min_interval = 60 / 800  # Rate limit: minimum seconds per poll

        try:
            # Collect data while duration not exceeded
            while (time() - startTime) < duration:
                now = time()  # Track loop time for adaptive rate limiting

                for channel in self.joined:
                    # Liveness is re-checked on every pass, so a channel
                    # that comes back online is picked up again.
                    if utils.is_live(channel, self.client_id):
                        response = self._sockets[channel].recv(16384)
                        if b"PING :tmi.twitch.tv\r\n" in response:
                            self._sockets[channel].send("PONG :tmi.twitch.tv\r\n".encode("utf-8"))
                            if debug:
                                print("\n\n!!Look, a ping: \n")
                                print(response)
                                print("\n\n")
                        else:
                            self._loggers[channel].info(response)
                            if debug:
                                print(response)

                    # Throttle after every channel, live or not, so that
                    # offline channels are not polled without any pause.
                    elapsed = time() - now
                    if elapsed < min_interval:
                        sleep(min_interval - elapsed)

            if debug:
                print("Collected for " + str(time()-startTime) + " seconds")
        finally:
            # Close sockets once not collecting data, even after an error
            for channel in self.joined:
                self._sockets[channel].close()

    def _split_line(self, line, firstLine = False):
        """
        Split one raw log line holding several IRC messages.
        Parameters:
            line (string)
                - A line read from a channel log file.
            firstLine (boolean, optional)
                - True for the line carrying the server welcome text; only
                  what follows 'End of /NAMES list' is kept in that case.
        Returns a list of strings. The first 28 characters of the original
        line are prepended to every piece except an untouched first piece,
        so each returned message starts with the same leading text.
        Raises IndexError if firstLine is true and the
        'End of /NAMES list' marker is not in the line.
        """
        prefix = line[:28]
        if firstLine:
            line = line.split('End of /NAMES list\\r\\n')[1]

        # Keep chat messages; the first piece is always kept.
        splits = [message for ind, message in enumerate(line.split("\\r\\n"))
                  if 'PRIVMSG' in message or ind == 0]

        # Without firstLine the first piece still carries its own prefix.
        first_to_prefix = 0 if firstLine else 1
        for i in range(first_to_prefix, len(splits)):
            splits[i] = prefix + splits[i]

        return splits

    def _parse_message(self, message, channel, timestamp):
        """Turn one split log message into a dict with text/username/timestamp."""
        row = {}

        # Parse message text. The channel was joined in lower case (see
        # _join_channels), so search for it in lower case as well.
        hash_channel_point = message.find("PRIVMSG #" + channel.lower())
        slice_ = message[hash_channel_point:]
        row['text'] = slice_[slice_.find(":") + 1:]

        # Parse username: the text between the first "b" (plus the two
        # characters after it) and the first "!".
        row['username'] = message[message.find("b"):message.find("!")][3:]

        # Parse timestamp (first 23 characters of the log line)
        if timestamp:
            row['timestamp'] = message[:23]

        return row

    def parse_logs(self, timestamp = True, channels = None):
        """
        Method for converting raw data from text logs into .CSV format.
        Parameters:
            timestamp (boolean, optional)
                - Whether or not to include the timestamp of chat messages.
                - Note: timestamps represent when message
                    was retrieved, not sent
            channels (string or list, optional)
                - Channel username(s) for whom the text logs
                    will be parsed into csv format.
                - If none are specified, the channels that are
                    currently joined will be parsed
        Reads '<channel>.log' and writes '<channel>.csv' for each channel.
        """
        if isinstance(channels, str):
            channels = [channels]

        # Check if specific list of channels is given
        if not channels:
            try:
                channels = self.joined
            except AttributeError:
                print("Please either connect to channels, or specify a list of log files to parse.")
                return

        for channel in channels:
            # Drop repeated lines while keeping their first-seen order.
            with open(channel + ".log") as f:
                lines = list(dict.fromkeys(f))

            # Separate the raw strings into separate messages
            split_messages = []
            for line in lines:
                count = line.count('.tmi.twitch.tv PRIVMSG #')

                if 'Your host is tmi.twitch.tv' in line:
                    if 'PRIVMSG' in line:
                        split_messages.extend(self._split_line(line, firstLine = True))
                elif count == 1:
                    if line.endswith('\\r\\n\'\n'):
                        # Strip the trailing escaped CRLF, quote and newline.
                        split_messages.append(line[:-6])
                    else:
                        split_messages.append(line)
                elif count > 1:
                    split_messages.extend(self._split_line(line))

            # Parse username, message text and (optional) datetime
            data = [self._parse_message(message, channel, timestamp)
                    for message in split_messages]

            # Write data to file
            pd.DataFrame(data).to_csv(channel + ".csv", index = False)




6 changes: 6 additions & 0 deletions twitch_listener/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Explicit relative imports: on Python 3 a bare `import TwitchListener`
# inside a package looks for a top-level module and fails.
from . import TwitchListener
from . import utils




25 changes: 25 additions & 0 deletions twitch_listener/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import logging
import requests
import json

def setup_loggers(name, log_file, level=logging.INFO):
    """Return the logger called *name*, writing to *log_file*.

    Parameters:
        name (string)
            - Name passed to logging.getLogger.
        log_file (string)
            - Path of the file the records are written to.
        level (int, optional)
            - Logging level set on the logger (default logging.INFO).

    Records are formatted as '<asctime> — <message>'. Calling this again
    with the same name and file reuses the existing handler instead of
    attaching another one, so records are not written more than once.
    """
    import os  # Local import: only needed to normalise the path below.

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores its target as an absolute path in baseFilename.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(logging.Formatter('%(asctime)s — %(message)s'))
        logger.addHandler(handler)

    return logger

def is_live(streamer_name, client_id):
    """Return True when the streams endpoint reports a stream for the channel.

    Parameters:
        streamer_name (string)
            - Channel name appended to the streams URL.
        client_id (string)
            - Sent as the `client_id` query parameter.

    Returns False when the response is not JSON, is not a JSON object, or
    has no non-null "stream" entry (for example an error payload).
    Network errors and timeouts raised by requests are not caught.
    """
    # NOTE(review): this targets the legacy "kraken" endpoint — confirm it
    # is still served, or migrate to the current Twitch API.
    twitch_api_stream_url = "https://api.twitch.tv/kraken/streams/" + streamer_name

    # `params` URL-encodes the client id; the timeout keeps a stalled
    # connection from blocking the caller forever.
    streamer_html = requests.get(
        twitch_api_stream_url,
        params={"client_id": client_id},
        timeout=10,
    )

    try:
        streamer = json.loads(streamer_html.content)
    except ValueError:
        # Body was not valid JSON.
        return False

    return isinstance(streamer, dict) and streamer.get("stream") is not None

0 comments on commit 45c3298

Please sign in to comment.