analyze_tweets.qmd

---
title: Analysis of tweets
author: Eyayaw Beze
format: 
    html:
        code-tools: true
        embed-resources: false
engine: python3
execute:
    echo: false
    warning: false
    message: false
fig-path: doc
---

```{python}
#| label: imports
import json
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime

from utils import validate_date, get_handles
from tidy_tweets import load_tweets
from simple_wordcloud import create_word_cloud
```


```{python}
#| tags: [parameters]
# Twitter handle to analyze. It selects the input files read further down
# (data/twitter-data/{handle}.json and data/twitter-data/tidy/{handle}.csv)
# and is used to build the tweet links printed by summary().
# NOTE(review): the cell is tagged `parameters`, presumably so the value can be
# overridden at render time — confirm against the render command.
handle = "AbiyAhmedAli"
```
```{python}
#| label: constants
# strptime pattern for the `created_at` timestamps of tweets,
# e.g. "2020-11-03T08:15:30.000Z".
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

# Calendar day drawn as the dashed red reference line on the engagement plots
# (start of the Northern Ethiopia Conflict).
conflict_start_date = datetime.strptime("2020-11-03", "%Y-%m-%d").date()
```

```{python}

def summary(tweets: list):
    """Print a markdown-formatted overview of a collection of tweets.

    The report covers: tweet volume and date range, engagement totals and
    per-tweet averages, the most liked and most retweeted tweets (as links),
    the five most common hashtags, the number of tweets carrying an
    ``attachments`` entry, and the total number of mentions.

    Relies on two notebook-level globals: ``DATE_FORMAT`` (to parse
    ``created_at``) and ``handle`` (to build the tweet URLs).

    Args:
        tweets: Tweet dicts. Each must provide ``created_at``, ``id``,
            ``text`` and ``public_metrics`` with ``like_count``,
            ``retweet_count``, ``reply_count`` and ``quote_count``.
            ``entities`` (``hashtags``, ``mentions``) and ``attachments``
            are optional.

    Returns:
        None. The report is written to stdout.
    """
    tweet_count = len(tweets)
    if tweet_count == 0:
        # min()/max() and the per-tweet averages are undefined without data;
        # report the empty volume instead of raising.
        print(f"1. Tweet Volume: {tweet_count}\n")
        print("    No tweets available to summarize.")
        return

    def total(metric):
        """Sum one ``public_metrics`` counter over all tweets."""
        return sum(tweet["public_metrics"][metric] for tweet in tweets)

    def as_link(tweet):
        """Return a markdown link to the tweet, labelled with a text preview."""
        # Collapse whitespace first: a blank line inside the link label would
        # end the paragraph and break the markdown link.
        preview = " ".join(tweet["text"].split())[:100]
        return f"[{preview}...](https://twitter.com/{handle}/status/{tweet['id']})"

    # 1. Tweet Volume and Timeframe (parse every timestamp only once)
    created = [datetime.strptime(tweet["created_at"], DATE_FORMAT) for tweet in tweets]
    oldest_date = min(created)
    newest_date = max(created)

    # 2. Engagement Metrics
    total_likes = total("like_count")
    total_retweets = total("retweet_count")
    total_replies = total("reply_count")
    total_quotes = total("quote_count")

    # 3. Most Engaging Tweets
    most_liked = max(tweets, key=lambda x: x["public_metrics"]["like_count"])
    most_retweeted = max(tweets, key=lambda x: x["public_metrics"]["retweet_count"])

    # 4. Common Hashtags (tweets without entities/hashtags contribute nothing)
    hashtags = [
        hashtag["tag"]
        for tweet in tweets
        for hashtag in tweet.get("entities", {}).get("hashtags", [])
    ]
    common_hashtags = Counter(hashtags).most_common(5)

    # 5. Media Usage
    # NOTE(review): counts every tweet with an "attachments" key; confirm that
    # attachments always means media for this data set.
    media_count = sum(1 for tweet in tweets if "attachments" in tweet)

    # 6. Interaction Patterns
    mentions_count = sum(
        len(tweet.get("entities", {}).get("mentions", [])) for tweet in tweets
    )

    # Print results (section numbers match the numbered steps above)
    print(f"1. Tweet Volume: {tweet_count}\n")
    print(f"    Date Range: {oldest_date.date()} to {newest_date.date()}")
    print("\n2. Total Engagement:\n")
    print(
        f"    Likes: {total_likes}, Retweets: {total_retweets}, Replies: {total_replies}, Quotes: {total_quotes}\n"
    )
    print(
        f"    Average per tweet - Likes: {total_likes/tweet_count:.2f}, Retweets: {total_retweets/tweet_count:.2f}"
    )
    print("\n3. Most Engaging Tweets:\n")
    print(
        f"    Most Liked ({most_liked['public_metrics']['like_count']} likes): {as_link(most_liked)}\n"
    )
    print(
        f"    Most Retweeted ({most_retweeted['public_metrics']['retweet_count']} retweets): {as_link(most_retweeted)}"
    )
    print(f"\n4. Common Hashtags: {common_hashtags}")
    print(f"\n5. Tweets with Media: {media_count}")
    print(f"\n6. Total Mentions: {mentions_count}")
```

```{python}
#| output: asis
#| label: user-info

# Load the raw tweets and the account metadata for the selected handle.
# `tweets` is reused by the summary and word-cloud cells further down.
tweets, user_info = load_tweets(f'data/twitter-data/{handle}.json')

# Emit the account metadata wrapped in a fenced ```json block; with
# `output: asis` the printed fence is interpreted as markdown.
print(
    "```json",
    json.dumps(user_info, indent=4, ensure_ascii=False),
    "```",
    sep="\n",
)
```

## Tweets summary
```{python}
#| output: asis
# Print the summary report; with `output: asis` the printed text (numbered
# items and tweet links) is passed through as markdown.
summary(tweets)
```

## Engagement trends


```{python}
#| label: tidy-data
#| layout-ncol: 2

def daily_stat(var: str, df: pd.DataFrame):
    """Return the per-day mean of one column of ``df``.

    Args:
        var: Name of the column to average.
        df: Frame holding a ``date`` column to group on plus the ``var``
            column.

    Returns:
        A DataFrame with one row per distinct ``date`` and two columns:
        ``date`` and ``var`` (the mean of ``var`` on that date).

    Raises:
        KeyError: If ``var`` is not a column of ``df``.
    """
    if var not in df.columns:
        # Name the missing column so the failure can be diagnosed at a glance.
        raise KeyError(f"{var!r} cannot be found in the df")
    return df.groupby("date")[var].mean().reset_index()

# Load the tidy per-tweet table and derive a calendar-day column to group on.
tweets_df = pd.read_csv(f"./data/twitter-data/tidy/{handle}.csv")
tweets_df["created_at"] = pd.to_datetime(tweets_df["created_at"], format=DATE_FORMAT)
tweets_df["date"] = tweets_df["created_at"].dt.date


def _as_thousands(value, _pos):
    """Format a y-axis tick value in thousands, e.g. 1500 -> "1.5k"."""
    # `g` keeps fractional thousands, so small series are not all labelled
    # "0k" and evenly spaced ticks never collapse to duplicate labels.
    return f"{value / 1000:g}k"


# One line plot of the daily mean per engagement metric.
metrics = ["retweet_count", "reply_count", "like_count", "quote_count", "impression_count"]
for metric in metrics:
    fig, ax = plt.subplots()
    sns.lineplot(x="date", y=metric, data=daily_stat(metric, tweets_df), ax=ax)
    # Dashed red reference line at the conflict start date.
    ax.axvline(x=conflict_start_date, color="red", linestyle="--")
    ax.yaxis.set_major_formatter(_as_thousands)
    ax.set_ylabel(metric.replace("_", " ").title())
    plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels by 45 degrees
    fig.tight_layout()  # Adjust layout to prevent label cutoff
    plt.show()
```
[Note: The red line indicates the start of the Northern Ethiopia Conflict, `{python} conflict_start_date.strftime("%b %d, %Y")`.]{style="font-size: 0.8em"}

## Wordcloud of tweets
```{python}
#| label: fig-wordcloud
#| column: page

# Build the word cloud from the tweets loaded in the user-info cell.
# NOTE(review): create_word_cloud comes from simple_wordcloud; presumably it
# draws the figure itself — confirm there.
create_word_cloud(tweets)
```