-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsynthetic_data.py
113 lines (94 loc) · 3.32 KB
/
synthetic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import random
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
def fecha_aleatoria(dias_atras=30):
    """Return a random timestamp from the last ``dias_atras`` days.

    The instant is picked with one-second granularity between
    ``now - dias_atras`` days and ``now`` (both ends included), using the
    local naive clock.

    Args:
        dias_atras: Size of the look-back window, in days.

    Returns:
        The chosen instant formatted as ``"YYYY-MM-DD HH:MM:SS"``.
    """
    now = datetime.now()
    window = timedelta(days=dias_atras)
    start = now - window
    # Whole seconds available in the window; the random offset is drawn from it.
    max_offset = int(window.total_seconds())
    offset = timedelta(seconds=random.randint(0, max_offset))
    return (start + offset).strftime("%Y-%m-%d %H:%M:%S")
# ================================
# 1. Tweets table
# ================================
# Dataset sizes; total_users also drives the number of rows in the users table.
total_users = 15
num_tweets = 10
tweet_categories = [
    "Política",
    "Economía",
    "Deportes",
    "Cultura",
    "Tecnología",
    "Salud",
    "Mundo",
]
# Creator handles are "User_1" .. "User_<total_users>".
tweet_creators = [f"User_{n}" for n in range(1, total_users + 1)]

# Candidate hashtags; each tweet gets 0, 1 or 2 distinct ones.
hashtag_pool = ["#news", "#trending", "#update", "#breaking", "#info", "#tech", "#world"]

tweets = []
tweet_id_counter = 1000  # tweet ids are consecutive, starting at 1000
for _ in range(num_tweets):
    # The order of the random draws below is kept fixed so that a seeded
    # run produces the same rows.
    creator = random.choice(tweet_creators)
    category = random.choice(tweet_categories)
    hashtag_count = random.choice([0, 1, 2])
    chosen_hashtags = random.sample(hashtag_pool, hashtag_count)
    tweets.append(
        {
            "tweet_id": tweet_id_counter,
            "creator": creator,
            "category": category,
            "hashtags": ", ".join(chosen_hashtags),
            "tweet_date": fecha_aleatoria(60),  # tweets span the last 60 days
        }
    )
    tweet_id_counter += 1

df_tweets = pd.DataFrame(tweets)

# Title, preview of the first rows, then a blank separator; the single call
# emits the same text as three consecutive prints.
print("Tabla de Tweets:", df_tweets.head(), "\n", sep="\n")
# ================================
# 2. Users table
# ================================
countries = ["USA", "España", "México", "Argentina", "Colombia", "Chile", "Perú"]
bios = [
    "Apasionado por la tecnología.",
    "Amante de la música y el cine.",
    "Periodista en busca de la verdad.",
    "Emprendedor y soñador.",
    "Experto en deportes y fitness.",
    "Estudiante de economía.",
    "Fotógrafo aficionado.",
]

# One row per user, "User_1" .. "User_<total_users>"; user_id and username
# carry the same handle. Dict values are evaluated top to bottom, so the
# order of the random draws per row is fixed.
users = [
    {
        "user_id": f"User_{n}",
        "username": f"User_{n}",
        "account_creation_date": fecha_aleatoria(365 * 5),  # within ~5 years
        "age": random.randint(18, 60),
        "country": random.choice(countries),
        "bio": random.choice(bios),
        "followers_count": random.randint(50, 10000),
    }
    for n in range(1, total_users + 1)
]

df_users = pd.DataFrame(users)

# Title, preview of the first rows, then a blank separator; the single call
# emits the same text as three consecutive prints.
print("Tabla de Usuarios:", df_users.head(), "\n", sep="\n")
# ================================
# 3. Interactions table
# ================================
num_interacciones = 50
interaction_types = ["comment", "like", "repost", "mention"]

# Timestamp layout used for the date columns. It has to match the strings
# stored in df_tweets["tweet_date"] so they can be parsed back.
interaction_date_format = "%Y-%m-%d %H:%M:%S"
interaction_window_days = 30  # interactions happen within the last 30 days


def _fecha_interaccion(tweet_date, dias_atras=interaction_window_days):
    """Return a random timestamp string that is not earlier than the tweet.

    The instant is drawn between ``max(tweet_date, now - dias_atras days)``
    and ``now``, so an interaction stays inside the look-back window but can
    never be dated before the tweet it refers to.

    Args:
        tweet_date: Tweet timestamp as a ``"YYYY-MM-DD HH:MM:SS"`` string.
        dias_atras: Size of the look-back window, in days.

    Returns:
        The chosen instant formatted as ``"YYYY-MM-DD HH:MM:SS"``.

    Raises:
        ValueError: If ``tweet_date`` does not match the expected layout.
    """
    now = datetime.now()
    tweet_moment = datetime.strptime(tweet_date, interaction_date_format)
    earliest = max(tweet_moment, now - timedelta(days=dias_atras))
    # Guard against a zero/negative span (tweet created "right now").
    span_seconds = max(0, int((now - earliest).total_seconds()))
    moment = earliest + timedelta(seconds=random.randint(0, span_seconds))
    return moment.strftime(interaction_date_format)


# Loop-invariant lookups, built once instead of on every iteration.
tweet_ids = df_tweets["tweet_id"].tolist()
user_ids = df_users["user_id"].tolist()
tweet_dates = dict(zip(tweet_ids, df_tweets["tweet_date"].tolist()))

interacciones = []
interaction_id_counter = 2000  # interaction ids are consecutive from 2000
for _ in range(num_interacciones):
    tipo = random.choice(interaction_types)
    tweet_ref = random.choice(tweet_ids)
    user_from = random.choice(user_ids)

    # Mentions always target a user; comments target one half of the time
    # (a reply to someone); likes and reposts have no target user.
    if tipo == "mention" or (tipo == "comment" and random.choice([0, 1]) == 1):
        user_to = random.choice(user_ids)
    else:
        user_to = ""

    interacciones.append(
        {
            "interaction_id": interaction_id_counter,
            "tweet_id": tweet_ref,
            "user_from": user_from,
            "user_to": user_to,
            "interaction_type": tipo,
            # Never earlier than the referenced tweet's own date.
            "interaction_date": _fecha_interaccion(tweet_dates[tweet_ref]),
        }
    )
    interaction_id_counter += 1

df_interacciones = pd.DataFrame(interacciones)
print("Tabla de Interacciones:")
print(df_interacciones.head())
print("\n")
# Write the three tables as CSV files under ./data (relative to the current
# working directory). to_csv does not create missing directories, so make
# sure the target folder exists before writing.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)
df_tweets.to_csv(output_dir / "tweets.csv", index=False)
df_users.to_csv(output_dir / "users.csv", index=False)
df_interacciones.to_csv(output_dir / "interactions.csv", index=False)