-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdesafio_engenheiro_dados_spark.py
37 lines (26 loc) · 1.52 KB
/
desafio_engenheiro_dados_spark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# import findspark
# findspark.init()
from pyspark.sql import SparkSession
# Alias pyspark's sum so it does not shadow the Python builtin `sum`.
from pyspark.sql.functions import (
    split, regexp_extract, size, to_timestamp, to_date, col, sum as spark_sum,
)

# Data-engineering challenge: parse Apache-style access logs from the
# 'files' directory and answer five summary questions.
spark = SparkSession.builder.appName('Desafio Engenheiro de Dados Spark').getOrCreate()

# Each input line is expected to look like:
#   host - - [dd/MMM/yyyy:HH:mm:ss -ZZZZ] "request" http_code bytes
file = spark.read.text('files')
split_col = split(file['value'], ' - - ')   # item 0 = host, item 1 = rest of line
split_by_space = split(file['value'], ' ')  # last two tokens = http_code, bytes

# Raw strings for the regexes: '\[' and '\"' are invalid escape sequences in
# plain string literals and raise SyntaxWarning on modern Python.
df = file.withColumn('host', split_col.getItem(0)) \
    .withColumn('timestamp', regexp_extract(split_col.getItem(1), r'\[(.*?)\]', 1))
# NOTE(review): the bracketed timestamp usually carries a timezone suffix
# (e.g. "01/Jul/1995:00:00:01 -0400"); the pattern below ignores it, which
# yields NULL under Spark 3's strict parser unless
# spark.sql.legacy.timeParserPolicy=LEGACY is set — confirm against the data.
df = df.withColumn('timestamp', to_timestamp(df.timestamp, 'dd/MMM/yyyy:HH:mm:ss')) \
    .withColumn('request', regexp_extract(split_col.getItem(1), r'"(.*?)"', 1)) \
    .withColumn('http_code', split_by_space.getItem(size(split_by_space) - 2)) \
    .withColumn('bytes', split_by_space.getItem(size(split_by_space) - 1).cast('long')) \
    .drop('value')

# 1. Count distinct hosts directly instead of groupBy().count().count().
hosts_number = df.select('host').distinct().count()
print('1. Número de hosts unicos: ', hosts_number)

# 2. Keep the 404-only frame in a variable: it is reused by questions 3 and 4.
not_found_errors_df = df.filter(df['http_code'] == '404')
not_found_errors = not_found_errors_df.count()
print('2. Total de erros 404: ', not_found_errors)

# 3. Bug fix: the question asks for the top-5 URLs, which live in the
# 'request' column — the original grouped by 'host' (the client machine),
# so the output did not match its label.
print('3. Os 5 URLs que mais causaram erro 404: ')
not_found_errors_df.groupBy('request').count().orderBy('count', ascending=False).show(5)

print('4. Quantidade de erros 404 por dia: (mostrando apenas os primeiros 10 dias)')
not_found_errors_df.groupBy(to_date(col('timestamp')).alias('date')).count().orderBy('date').show(10)

# 5. 'bytes' was cast to long above, so non-numeric placeholders ('-') become
# NULL and are skipped by the aggregation rather than silently coerced.
bytes_total = df.agg(spark_sum('bytes')).collect()[0][0]
print('5. O total de bytes retornados: ', bytes_total)