-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_job.sh
executable file
·100 lines (86 loc) · 2.7 KB
/
run_job.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
#
# Submit an EMR Serverless Spark job for the "csv_job" application.
#
# Usage: run_job.sh <ACTION> <EVENT_SUB_TYPES> <SECTION_IDS> <PAGES> [<EXCLUDED_FIELDS>]
#
# Requires 4 mandatory arguments plus one optional fifth (excluded fields).
if [ "$#" -lt 4 ] || [ "$#" -gt 5 ]; then
  echo "Usage: $0 <ACTION> <EVENT_SUB_TYPES> <SECTION_IDS> <PAGES> [<EXCLUDED_FIELDS>]" >&2
  exit 1
fi
# Assign command-line arguments to variables
ACTION="$1"
SUB_TYPES="$2"
SECTION_IDS="$3"
PAGES="$4"
# Optional fifth argument; defaults to the empty string when omitted.
EXCLUDED_FIELDS="${5:-}"
# Resolve the EMR Serverless application ID for the job application.
APP_NAME="csv_job"
APP_ID=$(aws emr-serverless list-applications \
  --query "applications[?name=='${APP_NAME}'].id" --output text)

# Abort when no application with that name exists in the account/region.
if [ -z "$APP_ID" ]; then
  echo "Error: Application with name '${APP_NAME}' not found."
  exit 1
fi
echo "Found Application ID: $APP_ID"
# Job submission parameters.
ENTRY_POINT="s3://analyticsjobs/job.py" # Update with your script path
ROLE_ARN="arn:aws:iam::762438811603:role/service-role/AmazonEMR-ExecutionRole-1731366715097" # Update with your IAM role
LOG_URI="s3://analyticsjobs/logs/" # Update with your log bucket
SPARK_PARAMS="--conf spark.executor.memory=2G --conf spark.executor.cores=2" # Customize as needed

# Unique job id: submission timestamp plus a 5-char random alphanumeric suffix.
ts="$(date '+%Y%m%d%H%M%S')"
suffix="$(LC_ALL=C tr -dc 'a-zA-Z0-9' < /dev/urandom | head -c 5)"
JOB_ID="${ts}-${suffix}"
# For debugging, pin a fixed id instead: JOB_ID="1816"
# Construct the start-job-run payload as JSON, built up across three
# heredocs so the optional --exclude_fields argument can be spliced in.
# NOTE(review): argument values are interpolated into the JSON without
# escaping; a quote or backslash in any input would produce invalid JSON —
# confirm callers only pass plain identifiers/comma-separated ids.
cat <<EOT > job-config.json
{
"applicationId": "$APP_ID",
"executionRoleArn": "$ROLE_ARN",
"jobDriver": {
"sparkSubmit": {
"entryPoint": "$ENTRY_POINT",
"entryPointArguments": [
"--bucket_name",
"torus-xapi-prod",
"--chunk_size",
"10000",
"--ignored_student_ids",
"1",
"--sub_types",
"$SUB_TYPES",
"--job_id",
"$JOB_ID",
"--section_ids",
"$SECTION_IDS",
"--page_ids",
"$PAGES",
"--action",
"$ACTION"
EOT
# Conditionally add `--exclude_fields` to `entryPointArguments` (only when a
# fifth script argument was supplied). The leading comma continues the array.
if [ -n "$EXCLUDED_FIELDS" ]; then
cat <<EOT >> job-config.json
,
"--exclude_fields",
"$EXCLUDED_FIELDS"
EOT
fi
# Close the JSON blob: array terminator, Spark submit params, and the
# monitoring configuration pointing job logs at the S3 log bucket.
cat <<EOT >> job-config.json
],
"sparkSubmitParameters": "--conf spark.archives=s3://analyticsjobs/dataset.zip#dataset --py-files s3://analyticsjobs/dataset.zip $SPARK_PARAMS"
}
},
"configurationOverrides": {
"monitoringConfiguration": {
"s3MonitoringConfiguration": {
"logUri": "$LOG_URI"
}
}
}
}
EOT
echo "Job configuration JSON created: job-config.json"
# Submit the job run; the aws CLI's exit status becomes the script's exit status.
aws emr-serverless start-job-run --cli-input-json file://job-config.json