
Commit

ruitianzhong committed Jun 1, 2024
2 parents c682818 + 9f352fc commit 443d6c4
Showing 4 changed files with 45 additions and 11 deletions.
31 changes: 30 additions & 1 deletion map-reduce/README.md
@@ -107,4 +107,33 @@ intermediate outputs, which helps to cut down the amount of data transferred fro

### setOutputKeyClass()

Set the key class for the job output data.

### Using the Hadoop Distributed File System (HDFS)

Initialization (format the NameNode and start the HDFS daemons):

```shell
bin/hdfs namenode -format
sbin/start-dfs.sh
```
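
A quick way to confirm that the daemons came up (a sketch; `jps` assumes a JDK is on the PATH):

```shell
# A pseudo-distributed HDFS typically shows NameNode, DataNode and SecondaryNameNode.
jps
# The dfsadmin report also confirms that the DataNode has registered with the NameNode.
bin/hdfs dfsadmin -report
```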

```shell
# create the HDFS home directory for the current user
hadoop fs -mkdir /user
hadoop fs -mkdir /user/{your user name on Linux}
# upload the input data (relative paths resolve under the home directory)
hadoop fs -mkdir test_data
hadoop fs -put ./data/grades.txt test_data
hadoop fs -put ./data/child-parent.txt test_data
# list the home directory and, after a job has run, inspect its output
hadoop fs -ls
hadoop fs -cat avg_by_class_output/*
```

Alternatively, visit the NameNode web UI at http://localhost:9870 directly.

It seems that relative paths (such as `test_data` above) are mapped to the current user's HDFS home directory, `/user/{your user name on Linux}`, without further configuration.
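
For example (assuming the HDFS user matches the Linux user, so the home directory is `/user/$USER`), these two listings should be equivalent:

```shell
# Relative HDFS paths resolve against the current user's home directory.
hadoop fs -ls test_data
hadoop fs -ls /user/$USER/test_data
```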

If `sbin/start-dfs.sh` fails because `pdsh` does not use ssh by default, set the following variable in `.bashrc`:

```shell
export PDSH_RCMD_TYPE=ssh
```
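
The new value only takes effect in a fresh shell; a sketch of reloading it and restarting HDFS:

```shell
# Reload .bashrc and restart the HDFS daemons so that the
# start/stop scripts pick up PDSH_RCMD_TYPE=ssh.
source ~/.bashrc
sbin/stop-dfs.sh
sbin/start-dfs.sh
```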
22 changes: 14 additions & 8 deletions map-reduce/run_hadoop_mapreduce.sh
@@ -1,7 +1,7 @@
echo "Run Mapreduce Job"
if [[ $# -eq 0 ]]
if [[ $# -ne 1 ]]
then
echo "Need argument; exit;"
echo "Need 1 argument; exit;"
exit 1
fi
mvn clean
@@ -10,18 +10,24 @@ avg_by_class_output_dir="avg_by_class_output"
 avg_by_student_output_dir="avg_by_student_output"
 relation_output_dir="relation_output"
 JAR_PATH="target/map-reduce-1.0-SNAPSHOT.jar"
+DATA_PREFIX="test_data"

 case $1 in
 avg_by_class)
-rm -rf ${avg_by_class_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByClass ./data/grades.txt ${avg_by_class_output_dir}
+hadoop fs -rm -f -r ${avg_by_class_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByClass ${DATA_PREFIX}/grades.txt ${avg_by_class_output_dir}
+hadoop fs -cat ${avg_by_class_output_dir}/* | tee ${avg_by_class_output_dir}
 ;;
 avg_by_student)
-rm -rf ${avg_by_student_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByStudent ./data/grades.txt ${avg_by_student_output_dir}
+hadoop fs -rm -f -r ${avg_by_student_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByStudent ${DATA_PREFIX}/grades.txt ${avg_by_student_output_dir}
+hadoop fs -cat ${avg_by_student_output_dir}/* | tee ${avg_by_student_output_dir}

 ;;
 relation)
-rm -rf ${relation_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.FindRelation ./data/child-parent.txt ${relation_output_dir}
+hadoop fs -rm -f -r ${relation_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.FindRelation ${DATA_PREFIX}/child-parent.txt ${relation_output_dir}
+hadoop fs -cat ${relation_output_dir}/* | tee ${relation_output_dir}
 ;;
 *)
 echo "$1 does not match any pattern"
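
For reference, the updated script expects exactly one argument naming the job; a typical invocation (a sketch, presumably run from the `map-reduce` directory after the HDFS setup above) looks like:

```shell
# Each argument selects one branch of the case statement above;
# the matching HDFS output directory is removed and recreated by the job.
bash run_hadoop_mapreduce.sh avg_by_class
bash run_hadoop_mapreduce.sh avg_by_student
bash run_hadoop_mapreduce.sh relation
```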
1 change: 1 addition & 0 deletions map-reduce/src/main/java/ink/zrt/AvgGradeByClass.java
@@ -45,6 +45,7 @@ public static void main(String[] args) throws IOException, InterruptedException,
job.setOutputValueClass(FloatWritable.class);

FileInputFormat.addInputPath(job, new Path(args[0]));

FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
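
For context, the same job can presumably be run by hand once the jar is built; a rough sketch (assuming the standard `mvn package` build and the `test_data` directory from the README):

```shell
# Build the jar referenced as JAR_PATH in run_hadoop_mapreduce.sh,
# clear any previous output in HDFS, then run the job and print its result.
mvn package
hadoop fs -rm -f -r avg_by_class_output
hadoop jar target/map-reduce-1.0-SNAPSHOT.jar ink.zrt.AvgGradeByClass test_data/grades.txt avg_by_class_output
hadoop fs -cat avg_by_class_output/*
```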
2 changes: 0 additions & 2 deletions spark-python/grade.py
@@ -17,8 +17,6 @@
 # It seems that the empty line is ignored
 print("[INFO] total record number is " + str(table.count()))

-table.select(sf.avg("grade").alias("Average grade")).groupBy("name")
-
 avg_by_student = table.filter(table.type == "必修") \
 .groupBy("name") \
 .avg("grade") \
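
For completeness, the PySpark script is typically launched with `spark-submit`; a minimal sketch (assuming a local Spark installation on the PATH, run from the repository root):

```shell
# Run the PySpark job on a local Spark master.
spark-submit --master "local[*]" spark-python/grade.py
```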
