
Commit

ruitianzhong committed Jun 1, 2024
2 parents c682818 + 9f352fc commit 443d6c4
Showing 4 changed files with 45 additions and 11 deletions.
31 changes: 30 additions & 1 deletion map-reduce/README.md
@@ -107,4 +107,33 @@ intermediate outputs, which helps to cut down the amount of data transferred fro

### setOutputKeyClass()

Set the key class for the job output data.

### Using the Hadoop Distributed File System (HDFS)

Initialization (format the NameNode and start the HDFS daemons):

```shell
bin/hdfs namenode -format
sbin/start-dfs.sh
```
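
A quick way to confirm that the daemons came up (a sketch; `jps` assumes a JDK is on the PATH):

```shell
# A pseudo-distributed HDFS typically shows NameNode, DataNode and SecondaryNameNode.
jps
# The dfsadmin report also confirms that the DataNode has registered with the NameNode.
bin/hdfs dfsadmin -report
```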

```shell
# create the HDFS home directory for the current user
hadoop fs -mkdir /user
hadoop fs -mkdir /user/{your user name on Linux}
# upload the input data (relative paths resolve under the home directory)
hadoop fs -mkdir test_data
hadoop fs -put ./data/grades.txt test_data
hadoop fs -put ./data/child-parent.txt test_data
# list the home directory and, after a job has run, inspect its output
hadoop fs -ls
hadoop fs -cat avg_by_class_output/*
```

Alternatively, visit the NameNode web UI at http://localhost:9870 directly.

It seems that relative paths (such as `test_data` above) are mapped to the current user's HDFS home directory, `/user/{your user name on Linux}`, without further configuration.
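
For example (assuming the HDFS user matches the Linux user, so the home directory is `/user/$USER`), these two listings should be equivalent:

```shell
# Relative HDFS paths resolve against the current user's home directory.
hadoop fs -ls test_data
hadoop fs -ls /user/$USER/test_data
```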

If `sbin/start-dfs.sh` fails because `pdsh` does not use ssh by default, set the following variable in `.bashrc`:

```shell
export PDSH_RCMD_TYPE=ssh
```
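
The new value only takes effect in a fresh shell; a sketch of reloading it and restarting HDFS:

```shell
# Reload .bashrc and restart the HDFS daemons so that the
# start/stop scripts pick up PDSH_RCMD_TYPE=ssh.
source ~/.bashrc
sbin/stop-dfs.sh
sbin/start-dfs.sh
```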
22 changes: 14 additions & 8 deletions map-reduce/run_hadoop_mapreduce.sh
@@ -1,7 +1,7 @@
echo "Run Mapreduce Job"
if [[ $# -eq 0 ]]
if [[ $# -ne 1 ]]
then
echo "Need argument; exit;"
echo "Need 1 argument; exit;"
exit 1
fi
mvn clean
@@ -10,18 +10,24 @@ avg_by_class_output_dir="avg_by_class_output"
 avg_by_student_output_dir="avg_by_student_output"
 relation_output_dir="relation_output"
 JAR_PATH="target/map-reduce-1.0-SNAPSHOT.jar"
+DATA_PREFIX="test_data"

 case $1 in
 avg_by_class)
-rm -rf ${avg_by_class_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByClass ./data/grades.txt ${avg_by_class_output_dir}
+hadoop fs -rm -f -r ${avg_by_class_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByClass ${DATA_PREFIX}/grades.txt ${avg_by_class_output_dir}
+hadoop fs -cat ${avg_by_class_output_dir}/* | tee ${avg_by_class_output_dir}
 ;;
 avg_by_student)
-rm -rf ${avg_by_student_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByStudent ./data/grades.txt ${avg_by_student_output_dir}
+hadoop fs -rm -f -r ${avg_by_student_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.AvgGradeByStudent ${DATA_PREFIX}/grades.txt ${avg_by_student_output_dir}
+hadoop fs -cat ${avg_by_student_output_dir}/* | tee ${avg_by_student_output_dir}

 ;;
 relation)
-rm -rf ${relation_output_dir}
-hadoop jar ${JAR_PATH} ink.zrt.FindRelation ./data/child-parent.txt ${relation_output_dir}
+hadoop fs -rm -f -r ${relation_output_dir}
+hadoop jar ${JAR_PATH} ink.zrt.FindRelation ${DATA_PREFIX}/child-parent.txt ${relation_output_dir}
+hadoop fs -cat ${relation_output_dir}/* | tee ${relation_output_dir}
 ;;
 *)
 echo "$1 does not match any pattern"
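
For reference, the updated script expects exactly one argument naming the job; a typical invocation (a sketch, presumably run from the `map-reduce` directory after the HDFS setup above) looks like:

```shell
# Each argument selects one branch of the case statement above;
# the matching HDFS output directory is removed and recreated by the job.
bash run_hadoop_mapreduce.sh avg_by_class
bash run_hadoop_mapreduce.sh avg_by_student
bash run_hadoop_mapreduce.sh relation
```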
1 change: 1 addition & 0 deletions map-reduce/src/main/java/ink/zrt/AvgGradeByClass.java
@@ -45,6 +45,7 @@ public static void main(String[] args) throws IOException, InterruptedException,
job.setOutputValueClass(FloatWritable.class);

FileInputFormat.addInputPath(job, new Path(args[0]));

FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
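
For context, the same job can presumably be run by hand once the jar is built; a rough sketch (assuming the standard `mvn package` build and the `test_data` directory from the README):

```shell
# Build the jar referenced as JAR_PATH in run_hadoop_mapreduce.sh,
# clear any previous output in HDFS, then run the job and print its result.
mvn package
hadoop fs -rm -f -r avg_by_class_output
hadoop jar target/map-reduce-1.0-SNAPSHOT.jar ink.zrt.AvgGradeByClass test_data/grades.txt avg_by_class_output
hadoop fs -cat avg_by_class_output/*
```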
2 changes: 0 additions & 2 deletions spark-python/grade.py
@@ -17,8 +17,6 @@
 # It seems that the empty line is ignored
 print("[INFO] total record number is " + str(table.count()))

-table.select(sf.avg("grade").alias("Average grade")).groupBy("name")
-
 avg_by_student = table.filter(table.type == "必修") \
 .groupBy("name") \
 .avg("grade") \
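
For completeness, the PySpark script is typically launched with `spark-submit`; a minimal sketch (assuming a local Spark installation on the PATH, run from the repository root):

```shell
# Run the PySpark job on a local Spark master.
spark-submit --master "local[*]" spark-python/grade.py
```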
