"""Packaging script for the ``twitch-listener`` distribution."""

# setuptools understands ``install_requires``; plain distutils ignores it and
# is no longer shipped with Python 3.12+, so it is kept only as a fallback.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

VERSION = '1.0.0'
REPO_URL = 'https://github.com/lloyd334/twitch-listener'

setup(
    # Distribution name as published on the package index.
    name='twitch-listener',
    # Importable package directory; it is spelled with an underscore on disk.
    packages=['twitch_listener'],
    version=VERSION,
    license='MIT',
    description='Tools to scrape Twitch chat data',
    author='Oliver Lloyd',
    author_email='ollielloyd96@outlook.com',
    url=REPO_URL,
    # NOTE(review): assumes a git tag named "v<VERSION>" is pushed to the
    # repository above - confirm before releasing.
    download_url=REPO_URL + '/archive/v' + VERSION + '.tar.gz',
    keywords=['Twitch', 'chatbot', 'scraper'],
    install_requires=[
        'validators',
        'beautifulsoup4',
        # Imported by twitch_listener.TwitchListener and twitch_listener.utils.
        'pandas',
        'requests',
    ],
    classifiers=[
        # One of "3 - Alpha", "4 - Beta" or "5 - Production/Stable".
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
    ],
)
from socket import socket
from time import time, sleep

import pandas as pd


def _load_utils():
    """Return the sibling ``utils`` module.

    The package-relative import is tried first; the top-level import is the
    fallback for setups where the package directory itself is on ``sys.path``.
    """
    try:
        from . import utils
    except ImportError:
        import utils
    return utils


class twitch(socket):
    """Collect chat from live Twitch channels over IRC and turn logs into CSV.

    The ``socket`` base class is kept for backward compatibility only: it is
    never initialised, and every joined channel gets its own plain ``socket``.
    """

    def __init__(self, nickname, oauth, client_id):
        """Store the credentials and pre-build the IRC login lines.

        Parameters:
            nickname (string)
                - Twitch user name sent in the NICK line.
            oauth (string)
                - OAuth token, with or without the leading ``oauth:``.
            client_id (string)
                - Client id handed to ``utils.is_live``.
        """
        self.nickname = nickname
        self.client_id = client_id
        self.oauth = oauth if oauth.startswith('oauth:') else 'oauth:' + oauth

        # IRC parameters
        self._server = "irc.chat.twitch.tv"
        self._port = 6667
        # Use the normalised token so a bare token still gets its prefix.
        self._passString = "PASS " + self.oauth + "\n"
        self._nameString = "NICK " + nickname + "\n"

    def _join_channels(self, channels):
        """Open one authenticated connection per channel that is live.

        Fills ``self._sockets`` and ``self._loggers`` (keyed by channel) and
        ``self.joined``. Channels that are not live are reported and skipped.
        """
        utils = _load_utils()

        self._sockets = {}
        self.joined = []
        self._loggers = {}

        for channel in channels:
            if not utils.is_live(channel, self.client_id):
                print(channel + " is not live right now.")
                continue

            connection = socket()
            try:
                connection.connect((self._server, self._port))
                # sendall() keeps writing until the whole line is sent.
                connection.sendall(self._passString.encode('utf-8'))
                connection.sendall(self._nameString.encode('utf-8'))
                joinString = "JOIN #" + channel.lower() + "\n"
                connection.sendall(joinString.encode('utf-8'))
            except OSError:
                # Do not leak the descriptor when the handshake fails.
                connection.close()
                raise

            self._sockets[channel] = connection
            self._loggers[channel] = utils.setup_loggers(channel, channel + '.log')
            self.joined.append(channel)

    def listen(self, channels, duration = 0, debug = True):

        """
        Method for scraping chat data from Twitch channels.

        Parameters:
            channels (string or list)
                - Channel(s) to connect to.
            duration (int)
                - Length of time to listen for, in seconds.
            debug (bool)
                - Debugging feature, will likely be removed in later version.

        Every received buffer is written, as a bytes repr, to the channel's
        logger. Collection stops when ``duration`` has elapsed or when no
        joined channel is live any more; the sockets are always closed.
        """
        utils = _load_utils()

        if isinstance(channels, str):
            channels = [channels]
        self._join_channels(channels)
        startTime = time()

        min_interval = 60 / 800  # seconds per receive, used as the rate limit
        # Channels seen offline (or closed by the server). Kept across loop
        # passes so they are not polled again.
        offline = set()

        try:
            while (time() - startTime) < duration:
                active = [name for name in self.joined if name not in offline]
                if not active:
                    break

                now = time()  # Track loop time for adaptive rate limiting
                for channel in active:
                    if not utils.is_live(channel, self.client_id):
                        offline.add(channel)
                        continue

                    response = self._sockets[channel].recv(16384)
                    if not response:
                        # An empty read means the server closed the connection.
                        offline.add(channel)
                        continue

                    if b"PING :tmi.twitch.tv\r\n" in response:
                        # NOTE: chat lines sharing a buffer with the PING are
                        # not logged.
                        self._sockets[channel].sendall(
                            "PONG :tmi.twitch.tv\r\n".encode("utf-8"))
                        if debug:
                            print("\n\n!!Look, a ping: \n")
                            print(response)
                            print("\n\n")
                    else:
                        self._loggers[channel].info(response)
                        if debug:
                            print(response)

                    elapsed = time() - now
                    if elapsed < min_interval:
                        sleep(min_interval - elapsed)  # Rate limit

            if debug:
                print("Collected for " + str(time()-startTime) + " seconds")
        finally:
            # Close sockets once not collecting data, even after an error.
            for channel in self.joined:
                self._sockets[channel].close()

    def _split_line(self, line, firstLine = False):
        """Split one raw log line holding several IRC messages.

        The first 28 characters of ``line`` are treated as a prefix and are
        prepended to every returned piece that does not already carry them
        (presumably the log timestamp, separator and opening ``b'`` - confirm
        against the logger format). With ``firstLine`` set, everything up to
        and including the end-of-NAMES marker is discarded first.
        """
        prefix = line[:28]
        if firstLine:
            line = line.split('End of /NAMES list\\r\\n')[1]

        # Keep the leading piece plus every piece that is a chat message.
        pieces = [piece
                  for position, piece in enumerate(line.split("\\r\\n"))
                  if 'PRIVMSG' in piece or position == 0]

        # The leading piece of an ordinary line still has its own prefix.
        return [prefix + piece if (firstLine or position != 0) else piece
                for position, piece in enumerate(pieces)]

    def _read_unique_lines(self, filename):
        """Return the lines of ``filename`` without duplicates, order kept."""
        # Default encoding on purpose: the log is written with the default too.
        with open(filename) as f:
            return list(dict.fromkeys(f))

    def _split_messages(self, lines):
        """Turn raw log lines into one string per chat message."""
        messages = []
        for line in lines:
            count = line.count('.tmi.twitch.tv PRIVMSG #')

            if 'Your host is tmi.twitch.tv' in line:
                # Connection banner; only useful if chat arrived with it.
                if 'PRIVMSG' in line:
                    messages.extend(self._split_line(line, firstLine = True))
            elif count == 1:
                if line.endswith('\\r\\n\'\n'):
                    # Drop the escaped CRLF, closing quote and newline.
                    messages.append(line[:-6])
                else:
                    messages.append(line)
            elif count > 1:
                messages.extend(self._split_line(line))
        return messages

    def _parse_message(self, message, channel, timestamp):
        """Build the CSV row (text, username, optional timestamp)."""
        row = {}

        # Message text: everything after the first colon that follows the
        # PRIVMSG marker. Channels are joined in lower case.
        tail = message[message.find("PRIVMSG #" + channel.lower()):]
        row['text'] = tail[tail.find(":") + 1:]

        # Username: between the b': that opens the logged bytes and the "!".
        row['username'] = message[message.find("b"):message.find("!")][3:]

        if timestamp:
            row['timestamp'] = message[:23]
        return row

    def parse_logs(self, timestamp = True, channels = None):

        """
        Method for converting raw data from text logs into .CSV format.

        Parameters:
            timestamp (boolean, optional)
                - Whether or not to include the timestamp of chat messages.
                - Note: timestamps represent when message
                    was retrieved, not sent
            channels (list, optional)
                - List of channel usernames for whom the text logs
                    will be parsed into csv format.
                - If none are specified, the channels that are
                    currently joined will be parsed

        Reads ``<channel>.log`` and writes ``<channel>.csv`` in the current
        working directory for every channel.
        """

        # Check if specific list of channels is given
        if not channels:
            try:
                channels = self.joined
            except AttributeError:
                print("Please either connect to channels, "
                      "or specify a list of log files to parse.")
                return

        columns = ['text', 'username']
        if timestamp:
            columns.append('timestamp')

        for channel in channels:
            lines = self._read_unique_lines(channel + ".log")
            data = [self._parse_message(message, channel, timestamp)
                    for message in self._split_messages(lines)]

            # Explicit columns keep the header even when no message was found.
            pd.DataFrame(data, columns = columns).to_csv(channel + ".csv",
                                                         index = False)
import json
import logging
import os


def setup_loggers(name, log_file, level=logging.INFO):
    """Return the logger called ``name`` that writes to ``log_file``.

    Records are formatted as ``<asctime> — <message>``. A file handler for
    ``log_file`` is attached only if the logger does not already have one, so
    calling this again for the same channel does not duplicate every record.

    Parameters:
        name (string)
            - Logger name.
        log_file (string)
            - Path of the file to append to.
        level (int, optional)
            - Logging level set on the logger on every call.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores the absolute path of its target in ``baseFilename``.
    target = os.path.abspath(os.fspath(log_file))
    attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(logging.Formatter('%(asctime)s — %(message)s'))
        logger.addHandler(handler)

    return logger


def is_live(streamer_name, client_id):
    """Return True when the streams endpoint reports an active stream.

    Parameters:
        streamer_name (string)
            - Channel name appended to the endpoint URL.
        client_id (string)
            - Sent as the ``client_id`` query parameter.

    A payload without a ``stream`` entry (for example an error object) counts
    as "not live". Network errors and timeouts raised by ``requests``, and
    JSON decoding errors, propagate to the caller.

    NOTE(review): the "kraken" endpoint looks like a legacy Twitch API -
    confirm it is still served before relying on the result.
    """
    # Imported here so the logging helper stays usable without requests.
    import requests

    twitch_api_stream_url = "https://api.twitch.tv/kraken/streams/" + streamer_name

    # ``params`` URL-encodes the id; the timeout stops a stalled request from
    # blocking the caller forever.
    streamer_html = requests.get(
        twitch_api_stream_url,
        params={"client_id": client_id},
        timeout=10,
    )

    streamer = json.loads(streamer_html.content)

    return isinstance(streamer, dict) and streamer.get("stream") is not None