pip-ready

oliver-lloyd · Jun 12, 2019 · 45c3298 · 45c3298
1 parent 045631e
commit 45c3298
Show file tree

Hide file tree

Showing 5 changed files with 263 additions and 0 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,2 @@
+# NOTE(review): presumably tells packaging tools to use README.md as the
+# project description file -- confirm the installed tooling honours this key.
+[metadata]
+description-file = README.md
diff --git a/setup.py b/setup.py
@@ -0,0 +1,28 @@
+"""Packaging script for the twitch-listener distribution."""
+
+# ``install_requires`` is a setuptools option; plain distutils only warns about
+# it and installs nothing, so prefer setuptools and fall back to distutils
+# solely when setuptools is unavailable.
+try:
+    from setuptools import setup
+except ImportError:
+    from distutils.core import setup
+
+setup(
+    name='twitch-listener',  # Distribution name (what users `pip install`)
+    # Must name the importable package directory, which uses an underscore.
+    packages=['twitch_listener'],
+    version='1.0.0',
+    license='MIT',
+    description='Tools to scrape Twitch chat data',
+    author='Oliver Lloyd',
+    author_email='ollielloyd96@outlook.com',
+    url='https://github.com/lloyd334/twitch-listener',
+    # NOTE(review): this is still the placeholder archive URL from a packaging
+    # template -- replace it with the real release tarball before publishing.
+    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
+    keywords=['Twitch', 'chatbot', 'scraper'],
+    install_requires=[
+        'validators',
+        'beautifulsoup4',
+        'pandas',    # imported by twitch_listener/TwitchListener.py
+        'requests',  # imported by twitch_listener/utils.py
+    ],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+
+        'Intended Audience :: Developers',
+        'Topic :: Software Development :: Build Tools',
+
+        'License :: OSI Approved :: MIT License',
+
+        'Programming Language :: Python :: 3',
+    ],
+)
diff --git a/twitch_listener/TwitchListener.py b/twitch_listener/TwitchListener.py
@@ -0,0 +1,202 @@
+from socket import socket
+from time import time, sleep
+
+import pandas as pd
+
+# Package-local helpers (is_live, setup_loggers). The relative import is used
+# when this module is loaded as part of the package; the plain import keeps the
+# module usable when the package directory itself is on sys.path.
+try:
+    from . import utils
+except ImportError:
+    import utils
+
+class twitch(socket):
+    """
+    Collects raw chat from live Twitch channels over IRC and converts the
+    resulting text logs into CSV files.
+
+    One plain socket is opened per joined channel; received data is written
+    to "<channel>.log" and parse_logs() turns those logs into "<channel>.csv".
+    """
+
+    def __init__(self, nickname, oauth, client_id):
+        """
+        Store the credentials and pre-build the IRC login lines.
+
+        Parameters:
+            nickname (string)
+                - Nickname sent in the IRC NICK command.
+            oauth (string)
+                - OAuth token, with or without the leading 'oauth:'.
+            client_id (string)
+                - Client id passed to utils.is_live.
+        """
+
+        self.nickname = nickname
+
+        self.client_id = client_id
+        if oauth.startswith('oauth:'):
+            self.oauth = oauth
+        else:
+            self.oauth = 'oauth:' + oauth
+
+        # IRC parameters
+        self._server = "irc.chat.twitch.tv"
+        self._port = 6667
+        # Use the normalised token so the 'oauth:' prefix is always present.
+        self._passString = "PASS " + self.oauth + "\n"
+        self._nameString = "NICK " + nickname + "\n"
+
+    def _join_channels(self, channels):
+        """
+        Open one socket per live channel, log in and JOIN the channel.
+
+        Resets self._sockets, self.joined and self._loggers. Channels that
+        utils.is_live reports as offline are skipped with a printed notice.
+        """
+
+        self._sockets = {}
+        self.joined = []
+        self._loggers = {}
+
+        # Establish socket connections
+        for channel in channels:
+            if not utils.is_live(channel, self.client_id):
+                print(channel + " is not live right now.")
+                continue
+
+            connection = socket()
+            try:
+                connection.connect((self._server, self._port))
+                connection.sendall(self._passString.encode('utf-8'))
+                connection.sendall(self._nameString.encode('utf-8'))
+
+                joinString = "JOIN #" + channel.lower() + "\n"
+                connection.sendall(joinString.encode('utf-8'))
+            except OSError:
+                # Do not leak the half-initialised socket.
+                connection.close()
+                raise
+
+            # Register the channel only once the join sequence was sent.
+            self._sockets[channel] = connection
+            self._loggers[channel] = utils.setup_loggers(channel, channel + '.log')
+            self.joined.append(channel)
+
+    def listen(self, channels, duration = 0, debug = True):
+
+        """
+        Method for scraping chat data from Twitch channels.
+
+        Parameters:
+            channels (string or list)
+                - Channel(s) to connect to.
+            duration (int)
+                 - Length of time to listen for, in seconds. With the
+                   default of 0 the channels are joined but nothing is
+                   collected.
+            debug (bool)
+                 - Debugging feature, will likely be removed in later version.
+        """
+
+        if isinstance(channels, str):
+            channels = [channels]
+
+        ping = b"PING :tmi.twitch.tv\r\n"
+        min_loop_seconds = 60/800  # Rate limit
+        # Channels that went offline (or whose connection closed). Kept
+        # across loop passes so they are not polled again.
+        offline = set()
+
+        try:
+            self._join_channels(channels)
+            startTime = time()
+
+            # Collect data while duration not exceeded and channels are live
+            while (time() - startTime) < duration:
+                live = [channel for channel in self.joined
+                        if channel not in offline]
+                if not live:
+                    break
+                now = time() # Track loop time for adaptive rate limiting
+
+                for channel in live:
+                    if not utils.is_live(channel, self.client_id):
+                        offline.add(channel)
+                        continue
+
+                    response = self._sockets[channel].recv(16384)
+                    if not response:
+                        # Empty read: the server closed the connection.
+                        offline.add(channel)
+                        continue
+
+                    if ping in response:
+                        self._sockets[channel].sendall("PONG :tmi.twitch.tv\r\n".encode("utf-8"))
+                        if debug:
+                            print("\n\n!!Look, a ping: \n")
+                            print(response)
+                            print("\n\n")
+                        # Chat received in the same read as the PING is
+                        # still logged, minus the PING line itself.
+                        chat = response.replace(ping, b"")
+                        if b"PRIVMSG" in chat:
+                            self._loggers[channel].info(chat)
+                    else:
+                        self._loggers[channel].info(response)
+                        if debug:
+                            print(response)
+
+                    elapsed = time() - now
+                    if elapsed < min_loop_seconds:
+                        sleep(min_loop_seconds - elapsed)
+
+            if debug:
+                print("Collected for " + str(time()-startTime) + " seconds")
+        finally:
+            # Close sockets once not collecting data, even after an error
+            for channel in self.joined:
+                self._sockets[channel].close()
+
+    def _split_line(self, line, firstLine = False):
+        """
+        Split one log line holding several IRC messages into one string per
+        message, each carrying the line's 28-character logging prefix.
+
+        With firstLine = True everything up to the 'End of /NAMES list'
+        marker is discarded first. If that marker is missing the line is
+        handled like an ordinary multi-message line.
+        """
+
+        prefix = line[:28]
+        if firstLine:
+            _, marker, remainder = line.partition('End of /NAMES list\\r\\n')
+            if marker:
+                line = remainder
+            else:
+                # Nothing was cut off, so the first piece keeps its prefix.
+                firstLine = False
+
+        splits = [message for ind, message in enumerate(line.split("\\r\\n"))
+                  if ind == 0 or 'PRIVMSG' in message]
+
+        # The first piece already starts with the prefix unless the
+        # connection preamble was stripped from the front of the line.
+        return [prefix + piece if (firstLine or i != 0) else piece
+                for i, piece in enumerate(splits)]
+
+    def parse_logs(self, timestamp = True, channels = None):
+
+        """
+        Method for converting raw data from text logs into .CSV format.
+
+        Parameters:
+            timestamp (boolean, optional)
+                - Whether or not to include the timestamp of chat messages.
+                - Note: timestamps represent when message
+                    was retrieved, not sent
+            channels (string or list, optional)
+                - Channel username(s) for whom the text logs
+                    will be parsed into csv format.
+                - If none are specified, the channels that are
+                    currently joined will be parsed
+        """
+
+        if isinstance(channels, str):
+            channels = [channels]
+
+        # Check if specific list of channels is given
+        if not channels:
+            channels = getattr(self, 'joined', None)
+            if not channels:
+                print("Please either connect to channels, "
+                      "or specify a list of log files to parse.")
+                return
+
+        for channel in channels:
+            filename = channel + ".log"
+
+            # Drop repeated lines while keeping first-seen order.
+            lines = []
+            seen = set()
+            with open(filename) as f:
+                for line in f:
+                    if line not in seen:
+                        seen.add(line)
+                        lines.append(line)
+
+            # Separate the raw strings into separate messages
+            split_messages = []
+            for line in lines:
+                count = line.count('.tmi.twitch.tv PRIVMSG #')
+
+                if 'Your host is tmi.twitch.tv' in line:
+                    # Connection preamble; may also carry chat messages.
+                    if 'PRIVMSG' in line:
+                        split_messages.extend(
+                            self._split_line(line, firstLine = True))
+                elif count == 0:
+                    continue
+                elif count == 1:
+                    # Trim the logged bytes-repr tail (\r\n, quote, newline).
+                    if line.endswith('\\r\\n\'\n'):
+                        split_messages.append(line[:-6])
+                    else:
+                        split_messages.append(line)
+                else:
+                    split_messages.extend(self._split_line(line))
+
+            # Parse username, message text and (optional) datetime
+            data = []
+            # Channels are joined in lower case, so search in lower case.
+            privmsg_marker = "PRIVMSG #" + channel.lower()
+            for message in split_messages:
+                hash_channel_point = message.find(privmsg_marker)
+                if hash_channel_point == -1:
+                    # Not a chat message for this channel; nothing to parse.
+                    continue
+                row = {}
+
+                # Parse message text
+                slice_ = message[hash_channel_point:]
+                row['text'] = slice_[slice_.find(":") + 1:]
+
+                # Parse username: the text between the "b':" that opens the
+                # logged bytes repr and the first "!".
+                b = message.find("b")
+                exclam = message.find("!")
+                row['username'] = message[b:exclam][3:]
+
+                # Parse timestamp (first 23 characters of the log line)
+                if timestamp:
+                    row['timestamp'] = message[:23]
+
+                # Store observations
+                data.append(row)
+
+            # Write data to file
+            pd.DataFrame(data).to_csv(channel + ".csv", index = False)
+
+
+
+
diff --git a/twitch_listener/__init__.py b/twitch_listener/__init__.py
@@ -0,0 +1,6 @@
+"""twitch_listener package: exposes the TwitchListener and utils submodules."""
+
+# Explicit relative imports: Python 3 does not resolve sibling modules through
+# a bare ``import TwitchListener`` from inside a package.
+from . import TwitchListener
+from . import utils
+
+
+
+
diff --git a/twitch_listener/utils.py b/twitch_listener/utils.py
@@ -0,0 +1,25 @@
+import logging
+import requests
+import json
+
+def setup_loggers(name, log_file, level=logging.INFO):
+    """
+    Return the logger called `name`, writing "<time> — <message>" lines to
+    `log_file`.
+
+    Parameters:
+        name (string)     - Name passed to logging.getLogger.
+        log_file (string) - Path of the file the logger writes to.
+        level (int)       - Logging level set on the logger.
+
+    Calling this again with the same name and file reuses the existing file
+    handler instead of attaching another one, so records are not duplicated.
+    """
+    import os
+
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+
+    # FileHandler stores the absolute path of its file as baseFilename.
+    target = os.path.abspath(log_file)
+    already_attached = any(
+        isinstance(existing, logging.FileHandler)
+        and existing.baseFilename == target
+        for existing in logger.handlers
+    )
+    if not already_attached:
+        handler = logging.FileHandler(log_file)
+        handler.setFormatter(logging.Formatter('%(asctime)s — %(message)s'))
+        logger.addHandler(handler)
+
+    return logger
+
+def is_live(streamer_name, client_id):
+    """
+    Ask the Twitch streams endpoint whether `streamer_name` is streaming.
+
+    Parameters:
+        streamer_name (string) - Channel name placed in the request path.
+        client_id (string)     - Sent as the `client_id` query parameter.
+
+    Returns:
+        True when the JSON reply has a non-null "stream" entry. False when
+        it is null or missing, or when the reply is not a JSON object.
+
+    Network failures and timeouts from `requests` are not caught here.
+    """
+    from urllib.parse import quote
+
+    # NOTE(review): Twitch has retired the Kraken (v5) API; this endpoint
+    # probably needs migrating to the Helix API -- confirm.
+    twitch_api_stream_url = "https://api.twitch.tv/kraken/streams/" \
+                    + quote(streamer_name, safe="")
+
+    # `params` URL-encodes the client id; the timeout stops a stalled
+    # request from blocking the caller indefinitely.
+    streamer_html = requests.get(twitch_api_stream_url,
+                                 params={"client_id": client_id},
+                                 timeout=10)
+
+    try:
+        streamer = json.loads(streamer_html.content)
+    except ValueError:
+        # Body was not JSON (e.g. an HTML error page): treat as not live.
+        return False
+
+    if not isinstance(streamer, dict):
+        return False
+
+    # Error replies carry no "stream" key; those count as not live too.
+    return streamer.get("stream") is not None