diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py
index 11cc19bb..ea7b6556 100644
--- a/snscrape/modules/telegram.py
+++ b/snscrape/modules/telegram.py
@@ -91,7 +91,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False):
 			url = rawUrl.replace('//t.me/', '//t.me/s/')
 			date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
 			if (message := post.find('div', class_ = 'tgme_widget_message_text')):
-				content = message.text
+				content = self.get_post_text(message)
 				outlinks = []
 				for link in post.find_all('a'):
 					if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
@@ -143,6 +143,17 @@ def get_items(self):
 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
 			soup = bs4.BeautifulSoup(r.text, 'lxml')
 
+	@staticmethod
+	def get_post_text(post) -> str:
+		'''Return the text of a message element, one text fragment per line.
+
+		Every string yielded by post.stripped_strings (whitespace-trimmed, blank ones
+		skipped) is joined with a newline, so text separated by tags such as <br> lands
+		on separate lines instead of being run together.
+		'''
+		# NOTE(review): inline tags (links, bold, ...) also start a new line; confirm that is wanted.
+		return '\n'.join(post.stripped_strings)
+
 	def _get_entity(self):
 		kwargs = {}
 		# /channel has a more accurate member count and bigger profile picture