diff --git a/.github/.keep b/.github/.keep new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 948dc85..84b20f4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![Review Assignment Due Date](https://classroom.github.com/assets/deadline-readme-button-22041afd0340ce965d47ae6ef1cefeee28c7c493a6346c4f15d667ab976d596c.svg)](https://classroom.github.com/a/uyodabcP) ## Лабораторная работа: Реализация MapReduce для анализа данных о продажах с ипользованием HADOOP!!! # Цель работы diff --git a/benchmarks/outputs/0.csv_m1_r1.txt b/benchmarks/outputs/0.csv_m1_r1.txt new file mode 100644 index 0000000..5dfb8ec --- /dev/null +++ b/benchmarks/outputs/0.csv_m1_r1.txt @@ -0,0 +1,20 @@ +video games 139430602.46 28043 +clothing 135666372.01 27412 +automotive 132122217.95 26650 +footwear 130416434.24 25786 +music instruments 130099073.68 26011 +groceries 128176561.14 25653 +gardening tools 127180582.16 25046 +jewelry 127163374.18 25301 +beauty products 126475981.06 25373 +baby products 125928841.39 25139 +stationery 124781323.26 25396 +electronics 122490773.21 24853 +furniture 121305602.93 24252 +sports equipment 120668760.77 23943 +pet supplies 119525698.85 23820 +home appliances 119416602.83 23654 +health & wellness 119380159.69 23559 +toys 115830491.33 23043 +books 115715264.64 23400 +office equipment 115295667.33 22841 diff --git a/benchmarks/outputs/0.csv_m2_r1.txt b/benchmarks/outputs/0.csv_m2_r1.txt new file mode 100644 index 0000000..5dfb8ec --- /dev/null +++ b/benchmarks/outputs/0.csv_m2_r1.txt @@ -0,0 +1,20 @@ +video games 139430602.46 28043 +clothing 135666372.01 27412 +automotive 132122217.95 26650 +footwear 130416434.24 25786 +music instruments 130099073.68 26011 +groceries 128176561.14 25653 +gardening tools 127180582.16 25046 +jewelry 127163374.18 25301 +beauty products 126475981.06 25373 +baby products 125928841.39 25139 +stationery 124781323.26 25396 +electronics 122490773.21 24853 +furniture 121305602.93 24252 +sports 
equipment 120668760.77 23943 +pet supplies 119525698.85 23820 +home appliances 119416602.83 23654 +health & wellness 119380159.69 23559 +toys 115830491.33 23043 +books 115715264.64 23400 +office equipment 115295667.33 22841 diff --git a/benchmarks/outputs/0.csv_m4_r1.txt b/benchmarks/outputs/0.csv_m4_r1.txt new file mode 100644 index 0000000..67e6b6b --- /dev/null +++ b/benchmarks/outputs/0.csv_m4_r1.txt @@ -0,0 +1,20 @@ +video games 139430602.46 28043 +clothing 135666372.01 27412 +automotive 132122217.95 26650 +footwear 130416434.24 25786 +music instruments 130099073.68 26011 +groceries 128176561.14 25653 +gardening tools 127180582.16 25046 +jewelry 127163374.18 25301 +beauty products 126475981.06 25373 +baby products 125928841.39 25139 +stationery 124781323.26 25396 +electronics 122490773.21 24853 +furniture 121305602.93 24252 +sports equipment 120668760.77 23943 +pet supplies 119525698.84 23820 +home appliances 119416602.83 23654 +health & wellness 119380159.69 23559 +toys 115830491.33 23043 +books 115715264.64 23400 +office equipment 115295667.33 22841 diff --git a/benchmarks/outputs/0.csv_m8_r1.txt b/benchmarks/outputs/0.csv_m8_r1.txt new file mode 100644 index 0000000..5dfb8ec --- /dev/null +++ b/benchmarks/outputs/0.csv_m8_r1.txt @@ -0,0 +1,20 @@ +video games 139430602.46 28043 +clothing 135666372.01 27412 +automotive 132122217.95 26650 +footwear 130416434.24 25786 +music instruments 130099073.68 26011 +groceries 128176561.14 25653 +gardening tools 127180582.16 25046 +jewelry 127163374.18 25301 +beauty products 126475981.06 25373 +baby products 125928841.39 25139 +stationery 124781323.26 25396 +electronics 122490773.21 24853 +furniture 121305602.93 24252 +sports equipment 120668760.77 23943 +pet supplies 119525698.85 23820 +home appliances 119416602.83 23654 +health & wellness 119380159.69 23559 +toys 115830491.33 23043 +books 115715264.64 23400 +office equipment 115295667.33 22841 diff --git a/benchmarks/outputs/1.csv_m1_r1.txt 
b/benchmarks/outputs/1.csv_m1_r1.txt new file mode 100644 index 0000000..a74a4f2 --- /dev/null +++ b/benchmarks/outputs/1.csv_m1_r1.txt @@ -0,0 +1,20 @@ +video games 259923298.50 52461 +electronics 259157635.18 52077 +stationery 254846728.32 50406 +music instruments 254594364.83 50892 +automotive 254029581.28 50763 +jewelry 253367048.81 50510 +beauty products 252910413.06 50458 +pet supplies 252174641.63 50540 +office equipment 251283372.44 50060 +sports equipment 249580033.14 50194 +baby products 249424562.17 49693 +furniture 248987660.38 49580 +clothing 248785345.87 49347 +gardening tools 246012434.46 49523 +groceries 244898375.71 48887 +books 243425758.79 48815 +toys 242682921.58 48790 +home appliances 241639264.60 48846 +health & wellness 241394436.48 48961 +footwear 241300629.56 48890 diff --git a/benchmarks/outputs/1.csv_m2_r1.txt b/benchmarks/outputs/1.csv_m2_r1.txt new file mode 100644 index 0000000..a74a4f2 --- /dev/null +++ b/benchmarks/outputs/1.csv_m2_r1.txt @@ -0,0 +1,20 @@ +video games 259923298.50 52461 +electronics 259157635.18 52077 +stationery 254846728.32 50406 +music instruments 254594364.83 50892 +automotive 254029581.28 50763 +jewelry 253367048.81 50510 +beauty products 252910413.06 50458 +pet supplies 252174641.63 50540 +office equipment 251283372.44 50060 +sports equipment 249580033.14 50194 +baby products 249424562.17 49693 +furniture 248987660.38 49580 +clothing 248785345.87 49347 +gardening tools 246012434.46 49523 +groceries 244898375.71 48887 +books 243425758.79 48815 +toys 242682921.58 48790 +home appliances 241639264.60 48846 +health & wellness 241394436.48 48961 +footwear 241300629.56 48890 diff --git a/benchmarks/outputs/1.csv_m4_r1.txt b/benchmarks/outputs/1.csv_m4_r1.txt new file mode 100644 index 0000000..a74a4f2 --- /dev/null +++ b/benchmarks/outputs/1.csv_m4_r1.txt @@ -0,0 +1,20 @@ +video games 259923298.50 52461 +electronics 259157635.18 52077 +stationery 254846728.32 50406 +music instruments 254594364.83 50892 +automotive 
254029581.28 50763 +jewelry 253367048.81 50510 +beauty products 252910413.06 50458 +pet supplies 252174641.63 50540 +office equipment 251283372.44 50060 +sports equipment 249580033.14 50194 +baby products 249424562.17 49693 +furniture 248987660.38 49580 +clothing 248785345.87 49347 +gardening tools 246012434.46 49523 +groceries 244898375.71 48887 +books 243425758.79 48815 +toys 242682921.58 48790 +home appliances 241639264.60 48846 +health & wellness 241394436.48 48961 +footwear 241300629.56 48890 diff --git a/benchmarks/outputs/1.csv_m8_r1.txt b/benchmarks/outputs/1.csv_m8_r1.txt new file mode 100644 index 0000000..a74a4f2 --- /dev/null +++ b/benchmarks/outputs/1.csv_m8_r1.txt @@ -0,0 +1,20 @@ +video games 259923298.50 52461 +electronics 259157635.18 52077 +stationery 254846728.32 50406 +music instruments 254594364.83 50892 +automotive 254029581.28 50763 +jewelry 253367048.81 50510 +beauty products 252910413.06 50458 +pet supplies 252174641.63 50540 +office equipment 251283372.44 50060 +sports equipment 249580033.14 50194 +baby products 249424562.17 49693 +furniture 248987660.38 49580 +clothing 248785345.87 49347 +gardening tools 246012434.46 49523 +groceries 244898375.71 48887 +books 243425758.79 48815 +toys 242682921.58 48790 +home appliances 241639264.60 48846 +health & wellness 241394436.48 48961 +footwear 241300629.56 48890 diff --git a/benchmarks/outputs/2.csv_m1_r1.txt b/benchmarks/outputs/2.csv_m1_r1.txt new file mode 100644 index 0000000..0e005f1 --- /dev/null +++ b/benchmarks/outputs/2.csv_m1_r1.txt @@ -0,0 +1,20 @@ +furniture 384585548.90 76840 +automotive 384490669.48 76821 +video games 383312906.53 76947 +stationery 382390452.33 76119 +clothing 381010652.25 76540 +beauty products 380639461.52 75923 +gardening tools 378827590.57 74934 +electronics 377424465.88 76011 +pet supplies 373751531.97 74261 +baby products 372758809.76 74648 +jewelry 372025759.02 74363 +health & wellness 371017780.49 73796 +home appliances 370879966.81 74210 +toys 370006875.26 
73268 +footwear 369532067.77 74133 +office equipment 368926332.33 73881 +music instruments 368840132.06 73516 +sports equipment 368221370.22 73034 +groceries 366356685.55 74175 +books 363867163.00 73219 diff --git a/benchmarks/outputs/2.csv_m2_r1.txt b/benchmarks/outputs/2.csv_m2_r1.txt new file mode 100644 index 0000000..0e005f1 --- /dev/null +++ b/benchmarks/outputs/2.csv_m2_r1.txt @@ -0,0 +1,20 @@ +furniture 384585548.90 76840 +automotive 384490669.48 76821 +video games 383312906.53 76947 +stationery 382390452.33 76119 +clothing 381010652.25 76540 +beauty products 380639461.52 75923 +gardening tools 378827590.57 74934 +electronics 377424465.88 76011 +pet supplies 373751531.97 74261 +baby products 372758809.76 74648 +jewelry 372025759.02 74363 +health & wellness 371017780.49 73796 +home appliances 370879966.81 74210 +toys 370006875.26 73268 +footwear 369532067.77 74133 +office equipment 368926332.33 73881 +music instruments 368840132.06 73516 +sports equipment 368221370.22 73034 +groceries 366356685.55 74175 +books 363867163.00 73219 diff --git a/benchmarks/outputs/2.csv_m4_r1.txt b/benchmarks/outputs/2.csv_m4_r1.txt new file mode 100644 index 0000000..0e922ca --- /dev/null +++ b/benchmarks/outputs/2.csv_m4_r1.txt @@ -0,0 +1,20 @@ +furniture 384585548.90 76840 +automotive 384490669.48 76821 +video games 383312906.53 76947 +stationery 382390452.33 76119 +clothing 381010652.25 76540 +beauty products 380639461.52 75923 +gardening tools 378827590.57 74934 +electronics 377424465.88 76011 +pet supplies 373751531.97 74261 +baby products 372758809.76 74648 +jewelry 372025759.02 74363 +health & wellness 371017780.49 73796 +home appliances 370879966.81 74210 +toys 370006875.26 73268 +footwear 369532067.77 74133 +office equipment 368926332.33 73881 +music instruments 368840132.06 73516 +sports equipment 368221370.21 73034 +groceries 366356685.55 74175 +books 363867163.00 73219 diff --git a/benchmarks/outputs/2.csv_m8_r1.txt b/benchmarks/outputs/2.csv_m8_r1.txt new file mode 
100644 index 0000000..aa539a8 --- /dev/null +++ b/benchmarks/outputs/2.csv_m8_r1.txt @@ -0,0 +1,20 @@ +furniture 384585548.90 76840 +automotive 384490669.48 76821 +video games 383312906.53 76947 +stationery 382390452.33 76119 +clothing 381010652.25 76540 +beauty products 380639461.52 75923 +gardening tools 378827590.57 74934 +electronics 377424465.88 76011 +pet supplies 373751531.97 74261 +baby products 372758809.76 74648 +jewelry 372025759.02 74363 +health & wellness 371017780.49 73796 +home appliances 370879966.81 74210 +toys 370006875.25 73268 +footwear 369532067.77 74133 +office equipment 368926332.33 73881 +music instruments 368840132.06 73516 +sports equipment 368221370.22 73034 +groceries 366356685.54 74175 +books 363867163.00 73219 diff --git a/benchmarks/outputs/3.csv_m1_r1.txt b/benchmarks/outputs/3.csv_m1_r1.txt new file mode 100644 index 0000000..374d067 --- /dev/null +++ b/benchmarks/outputs/3.csv_m1_r1.txt @@ -0,0 +1,20 @@ +baby products 512285901.90 101809 +beauty products 507675683.90 101701 +gardening tools 505693257.19 100353 +clothing 505352689.55 101738 +furniture 505293883.16 101313 +books 503362444.94 100086 +footwear 503153799.52 100066 +video games 502441147.22 101002 +automotive 501006821.45 99880 +groceries 500857294.55 100125 +pet supplies 500647164.15 100062 +stationery 500542946.70 101125 +sports equipment 497144558.73 99279 +home appliances 496840924.83 99171 +music instruments 496601427.08 99061 +health & wellness 496044096.09 98891 +toys 495260742.79 98372 +jewelry 493881862.86 99056 +electronics 491998226.22 99342 +office equipment 490119871.84 97846 diff --git a/benchmarks/outputs/3.csv_m2_r1.txt b/benchmarks/outputs/3.csv_m2_r1.txt new file mode 100644 index 0000000..6a0a031 --- /dev/null +++ b/benchmarks/outputs/3.csv_m2_r1.txt @@ -0,0 +1,20 @@ +baby products 512285901.90 101809 +beauty products 507675683.90 101701 +gardening tools 505693257.18 100353 +clothing 505352689.55 101738 +furniture 505293883.16 101313 +books 
503362444.94 100086 +footwear 503153799.52 100066 +video games 502441147.22 101002 +automotive 501006821.45 99880 +groceries 500857294.55 100125 +pet supplies 500647164.15 100062 +stationery 500542946.70 101125 +sports equipment 497144558.73 99279 +home appliances 496840924.83 99171 +music instruments 496601427.08 99061 +health & wellness 496044096.09 98891 +toys 495260742.79 98372 +jewelry 493881862.86 99056 +electronics 491998226.22 99342 +office equipment 490119871.84 97846 diff --git a/benchmarks/outputs/3.csv_m4_r1.txt b/benchmarks/outputs/3.csv_m4_r1.txt new file mode 100644 index 0000000..374d067 --- /dev/null +++ b/benchmarks/outputs/3.csv_m4_r1.txt @@ -0,0 +1,20 @@ +baby products 512285901.90 101809 +beauty products 507675683.90 101701 +gardening tools 505693257.19 100353 +clothing 505352689.55 101738 +furniture 505293883.16 101313 +books 503362444.94 100086 +footwear 503153799.52 100066 +video games 502441147.22 101002 +automotive 501006821.45 99880 +groceries 500857294.55 100125 +pet supplies 500647164.15 100062 +stationery 500542946.70 101125 +sports equipment 497144558.73 99279 +home appliances 496840924.83 99171 +music instruments 496601427.08 99061 +health & wellness 496044096.09 98891 +toys 495260742.79 98372 +jewelry 493881862.86 99056 +electronics 491998226.22 99342 +office equipment 490119871.84 97846 diff --git a/benchmarks/outputs/3.csv_m8_r1.txt b/benchmarks/outputs/3.csv_m8_r1.txt new file mode 100644 index 0000000..374d067 --- /dev/null +++ b/benchmarks/outputs/3.csv_m8_r1.txt @@ -0,0 +1,20 @@ +baby products 512285901.90 101809 +beauty products 507675683.90 101701 +gardening tools 505693257.19 100353 +clothing 505352689.55 101738 +furniture 505293883.16 101313 +books 503362444.94 100086 +footwear 503153799.52 100066 +video games 502441147.22 101002 +automotive 501006821.45 99880 +groceries 500857294.55 100125 +pet supplies 500647164.15 100062 +stationery 500542946.70 101125 +sports equipment 497144558.73 99279 +home appliances 496840924.83 
99171 +music instruments 496601427.08 99061 +health & wellness 496044096.09 98891 +toys 495260742.79 98372 +jewelry 493881862.86 99056 +electronics 491998226.22 99342 +office equipment 490119871.84 97846 diff --git a/benchmarks/outputs/4.csv_m1_r1.txt b/benchmarks/outputs/4.csv_m1_r1.txt new file mode 100644 index 0000000..d88beda --- /dev/null +++ b/benchmarks/outputs/4.csv_m1_r1.txt @@ -0,0 +1,20 @@ +footwear 639403255.74 127080 +baby products 639211761.60 127226 +beauty products 633703344.66 125850 +video games 630611736.97 126225 +automotive 630502807.28 125799 +music instruments 629086183.96 125911 +gardening tools 628565614.09 126055 +furniture 626281174.31 125195 +pet supplies 626019763.38 125391 +clothing 625294900.94 125459 +office equipment 620446962.13 124466 +toys 620388296.62 124126 +stationery 620388231.10 124563 +electronics 619936891.01 124592 +books 619096767.88 123629 +home appliances 618650159.27 124249 +jewelry 616792080.83 123466 +groceries 615728546.76 123276 +sports equipment 614185839.24 123253 +health & wellness 611481857.12 123163 diff --git a/benchmarks/outputs/4.csv_m2_r1.txt b/benchmarks/outputs/4.csv_m2_r1.txt new file mode 100644 index 0000000..d88beda --- /dev/null +++ b/benchmarks/outputs/4.csv_m2_r1.txt @@ -0,0 +1,20 @@ +footwear 639403255.74 127080 +baby products 639211761.60 127226 +beauty products 633703344.66 125850 +video games 630611736.97 126225 +automotive 630502807.28 125799 +music instruments 629086183.96 125911 +gardening tools 628565614.09 126055 +furniture 626281174.31 125195 +pet supplies 626019763.38 125391 +clothing 625294900.94 125459 +office equipment 620446962.13 124466 +toys 620388296.62 124126 +stationery 620388231.10 124563 +electronics 619936891.01 124592 +books 619096767.88 123629 +home appliances 618650159.27 124249 +jewelry 616792080.83 123466 +groceries 615728546.76 123276 +sports equipment 614185839.24 123253 +health & wellness 611481857.12 123163 diff --git a/benchmarks/outputs/4.csv_m4_r1.txt 
b/benchmarks/outputs/4.csv_m4_r1.txt new file mode 100644 index 0000000..7c991b5 --- /dev/null +++ b/benchmarks/outputs/4.csv_m4_r1.txt @@ -0,0 +1,20 @@ +footwear 639403255.74 127080 +baby products 639211761.60 127226 +beauty products 633703344.66 125850 +video games 630611736.97 126225 +automotive 630502807.28 125799 +music instruments 629086183.96 125911 +gardening tools 628565614.09 126055 +furniture 626281174.31 125195 +pet supplies 626019763.38 125391 +clothing 625294900.94 125459 +office equipment 620446962.13 124466 +toys 620388296.62 124126 +stationery 620388231.09 124563 +electronics 619936891.02 124592 +books 619096767.88 123629 +home appliances 618650159.27 124249 +jewelry 616792080.83 123466 +groceries 615728546.76 123276 +sports equipment 614185839.25 123253 +health & wellness 611481857.12 123163 diff --git a/benchmarks/outputs/4.csv_m8_r1.txt b/benchmarks/outputs/4.csv_m8_r1.txt new file mode 100644 index 0000000..8a93c87 --- /dev/null +++ b/benchmarks/outputs/4.csv_m8_r1.txt @@ -0,0 +1,20 @@ +footwear 639403255.74 127080 +baby products 639211761.60 127226 +beauty products 633703344.66 125850 +video games 630611736.97 126225 +automotive 630502807.28 125799 +music instruments 629086183.96 125911 +gardening tools 628565614.09 126055 +furniture 626281174.31 125195 +pet supplies 626019763.38 125391 +clothing 625294900.94 125459 +office equipment 620446962.13 124466 +toys 620388296.62 124126 +stationery 620388231.09 124563 +electronics 619936891.01 124592 +books 619096767.88 123629 +home appliances 618650159.27 124249 +jewelry 616792080.83 123466 +groceries 615728546.76 123276 +sports equipment 614185839.24 123253 +health & wellness 611481857.12 123163 diff --git a/benchmarks/outputs/5.csv_m1_r1.txt b/benchmarks/outputs/5.csv_m1_r1.txt new file mode 100644 index 0000000..c2d20bd --- /dev/null +++ b/benchmarks/outputs/5.csv_m1_r1.txt @@ -0,0 +1,20 @@ +clothing 765190140.76 152466 +gardening tools 761246670.52 152062 +music instruments 760851057.08 152670 
+automotive 756896426.54 150588 +baby products 755500610.17 151207 +video games 755257841.56 150556 +sports equipment 752812776.04 151074 +jewelry 752061838.24 150679 +stationery 751635389.76 151128 +health & wellness 749861093.31 149068 +beauty products 748980486.09 149824 +office equipment 748715205.11 150094 +pet supplies 747688920.05 148731 +electronics 744981161.93 148793 +toys 744514761.34 149831 +books 741677657.79 147919 +groceries 740393128.25 148366 +home appliances 739505896.73 148735 +furniture 738560499.35 148590 +footwear 737860007.42 147968 diff --git a/benchmarks/outputs/5.csv_m2_r1.txt b/benchmarks/outputs/5.csv_m2_r1.txt new file mode 100644 index 0000000..c2d20bd --- /dev/null +++ b/benchmarks/outputs/5.csv_m2_r1.txt @@ -0,0 +1,20 @@ +clothing 765190140.76 152466 +gardening tools 761246670.52 152062 +music instruments 760851057.08 152670 +automotive 756896426.54 150588 +baby products 755500610.17 151207 +video games 755257841.56 150556 +sports equipment 752812776.04 151074 +jewelry 752061838.24 150679 +stationery 751635389.76 151128 +health & wellness 749861093.31 149068 +beauty products 748980486.09 149824 +office equipment 748715205.11 150094 +pet supplies 747688920.05 148731 +electronics 744981161.93 148793 +toys 744514761.34 149831 +books 741677657.79 147919 +groceries 740393128.25 148366 +home appliances 739505896.73 148735 +furniture 738560499.35 148590 +footwear 737860007.42 147968 diff --git a/benchmarks/outputs/5.csv_m4_r1.txt b/benchmarks/outputs/5.csv_m4_r1.txt new file mode 100644 index 0000000..fcbfa8b --- /dev/null +++ b/benchmarks/outputs/5.csv_m4_r1.txt @@ -0,0 +1,20 @@ +clothing 765190140.76 152466 +gardening tools 761246670.52 152062 +music instruments 760851057.08 152670 +automotive 756896426.54 150588 +baby products 755500610.17 151207 +video games 755257841.56 150556 +sports equipment 752812776.04 151074 +jewelry 752061838.24 150679 +stationery 751635389.76 151128 +health & wellness 749861093.31 149068 +beauty products 
748980486.09 149824 +office equipment 748715205.11 150094 +pet supplies 747688920.04 148731 +electronics 744981161.93 148793 +toys 744514761.34 149831 +books 741677657.79 147919 +groceries 740393128.25 148366 +home appliances 739505896.73 148735 +furniture 738560499.35 148590 +footwear 737860007.42 147968 diff --git a/benchmarks/outputs/5.csv_m8_r1.txt b/benchmarks/outputs/5.csv_m8_r1.txt new file mode 100644 index 0000000..ab1345b --- /dev/null +++ b/benchmarks/outputs/5.csv_m8_r1.txt @@ -0,0 +1,20 @@ +clothing 765190140.77 152466 +gardening tools 761246670.52 152062 +music instruments 760851057.08 152670 +automotive 756896426.54 150588 +baby products 755500610.17 151207 +video games 755257841.56 150556 +sports equipment 752812776.04 151074 +jewelry 752061838.24 150679 +stationery 751635389.76 151128 +health & wellness 749861093.31 149068 +beauty products 748980486.09 149824 +office equipment 748715205.11 150094 +pet supplies 747688920.05 148731 +electronics 744981161.93 148793 +toys 744514761.34 149831 +books 741677657.79 147919 +groceries 740393128.25 148366 +home appliances 739505896.73 148735 +furniture 738560499.35 148590 +footwear 737860007.42 147968 diff --git a/benchmarks/outputs/6.csv_m1_r1.txt b/benchmarks/outputs/6.csv_m1_r1.txt new file mode 100644 index 0000000..928b8b4 --- /dev/null +++ b/benchmarks/outputs/6.csv_m1_r1.txt @@ -0,0 +1,20 @@ +clothing 891753714.47 177817 +video games 885102477.86 176843 +baby products 882598840.63 176193 +beauty products 882016991.16 176416 +electronics 881082373.54 176604 +home appliances 880042973.40 176182 +gardening tools 875792722.42 175676 +pet supplies 875326195.93 174763 +books 874860008.97 174419 +automotive 873659129.55 174747 +furniture 872428379.31 174422 +office equipment 872354630.71 174504 +sports equipment 870808672.97 174283 +toys 870701976.61 175017 +health & wellness 870674814.31 174277 +groceries 869403908.71 173921 +music instruments 868033172.83 173895 +jewelry 863256038.45 172945 +footwear 
863038076.46 173291 +stationery 858410340.72 171653 diff --git a/benchmarks/outputs/6.csv_m2_r1.txt b/benchmarks/outputs/6.csv_m2_r1.txt new file mode 100644 index 0000000..759bf08 --- /dev/null +++ b/benchmarks/outputs/6.csv_m2_r1.txt @@ -0,0 +1,20 @@ +clothing 891753714.47 177817 +video games 885102477.85 176843 +baby products 882598840.63 176193 +beauty products 882016991.16 176416 +electronics 881082373.54 176604 +home appliances 880042973.40 176182 +gardening tools 875792722.43 175676 +pet supplies 875326195.93 174763 +books 874860008.97 174419 +automotive 873659129.55 174747 +furniture 872428379.31 174422 +office equipment 872354630.71 174504 +sports equipment 870808672.97 174283 +toys 870701976.61 175017 +health & wellness 870674814.31 174277 +groceries 869403908.71 173921 +music instruments 868033172.83 173895 +jewelry 863256038.46 172945 +footwear 863038076.46 173291 +stationery 858410340.72 171653 diff --git a/benchmarks/outputs/6.csv_m4_r1.txt b/benchmarks/outputs/6.csv_m4_r1.txt new file mode 100644 index 0000000..6dfdac9 --- /dev/null +++ b/benchmarks/outputs/6.csv_m4_r1.txt @@ -0,0 +1,20 @@ +clothing 891753714.47 177817 +video games 885102477.86 176843 +baby products 882598840.63 176193 +beauty products 882016991.16 176416 +electronics 881082373.54 176604 +home appliances 880042973.40 176182 +gardening tools 875792722.43 175676 +pet supplies 875326195.93 174763 +books 874860008.97 174419 +automotive 873659129.55 174747 +furniture 872428379.31 174422 +office equipment 872354630.71 174504 +sports equipment 870808672.97 174283 +toys 870701976.61 175017 +health & wellness 870674814.31 174277 +groceries 869403908.71 173921 +music instruments 868033172.83 173895 +jewelry 863256038.45 172945 +footwear 863038076.46 173291 +stationery 858410340.72 171653 diff --git a/benchmarks/outputs/6.csv_m8_r1.txt b/benchmarks/outputs/6.csv_m8_r1.txt new file mode 100644 index 0000000..6dfdac9 --- /dev/null +++ b/benchmarks/outputs/6.csv_m8_r1.txt @@ -0,0 +1,20 @@ 
+clothing 891753714.47 177817 +video games 885102477.86 176843 +baby products 882598840.63 176193 +beauty products 882016991.16 176416 +electronics 881082373.54 176604 +home appliances 880042973.40 176182 +gardening tools 875792722.43 175676 +pet supplies 875326195.93 174763 +books 874860008.97 174419 +automotive 873659129.55 174747 +furniture 872428379.31 174422 +office equipment 872354630.71 174504 +sports equipment 870808672.97 174283 +toys 870701976.61 175017 +health & wellness 870674814.31 174277 +groceries 869403908.71 173921 +music instruments 868033172.83 173895 +jewelry 863256038.45 172945 +footwear 863038076.46 173291 +stationery 858410340.72 171653 diff --git a/benchmarks/outputs/7.csv_m1_r1.txt b/benchmarks/outputs/7.csv_m1_r1.txt new file mode 100644 index 0000000..97d89e1 --- /dev/null +++ b/benchmarks/outputs/7.csv_m1_r1.txt @@ -0,0 +1,20 @@ +gardening tools 1008561966.34 202192 +clothing 1007248356.12 200708 +home appliances 1006912573.28 200768 +furniture 1006544014.81 200052 +music instruments 1004189054.63 200433 +video games 1004028296.39 201249 +baby products 1003726034.63 201271 +toys 1003067588.60 200294 +beauty products 1001471966.40 200872 +groceries 1001100730.31 201067 +electronics 1000455104.06 200994 +automotive 997153657.20 199714 +office equipment 996422905.49 198678 +sports equipment 995965801.22 199227 +books 995615759.95 199461 +health & wellness 994228655.01 198760 +pet supplies 993607814.43 199156 +stationery 988799500.22 197875 +jewelry 985275668.40 197660 +footwear 980870712.65 197210 diff --git a/benchmarks/outputs/7.csv_m2_r1.txt b/benchmarks/outputs/7.csv_m2_r1.txt new file mode 100644 index 0000000..97d89e1 --- /dev/null +++ b/benchmarks/outputs/7.csv_m2_r1.txt @@ -0,0 +1,20 @@ +gardening tools 1008561966.34 202192 +clothing 1007248356.12 200708 +home appliances 1006912573.28 200768 +furniture 1006544014.81 200052 +music instruments 1004189054.63 200433 +video games 1004028296.39 201249 +baby products 1003726034.63 201271 
+toys 1003067588.60 200294 +beauty products 1001471966.40 200872 +groceries 1001100730.31 201067 +electronics 1000455104.06 200994 +automotive 997153657.20 199714 +office equipment 996422905.49 198678 +sports equipment 995965801.22 199227 +books 995615759.95 199461 +health & wellness 994228655.01 198760 +pet supplies 993607814.43 199156 +stationery 988799500.22 197875 +jewelry 985275668.40 197660 +footwear 980870712.65 197210 diff --git a/benchmarks/outputs/7.csv_m4_r1.txt b/benchmarks/outputs/7.csv_m4_r1.txt new file mode 100644 index 0000000..97d89e1 --- /dev/null +++ b/benchmarks/outputs/7.csv_m4_r1.txt @@ -0,0 +1,20 @@ +gardening tools 1008561966.34 202192 +clothing 1007248356.12 200708 +home appliances 1006912573.28 200768 +furniture 1006544014.81 200052 +music instruments 1004189054.63 200433 +video games 1004028296.39 201249 +baby products 1003726034.63 201271 +toys 1003067588.60 200294 +beauty products 1001471966.40 200872 +groceries 1001100730.31 201067 +electronics 1000455104.06 200994 +automotive 997153657.20 199714 +office equipment 996422905.49 198678 +sports equipment 995965801.22 199227 +books 995615759.95 199461 +health & wellness 994228655.01 198760 +pet supplies 993607814.43 199156 +stationery 988799500.22 197875 +jewelry 985275668.40 197660 +footwear 980870712.65 197210 diff --git a/benchmarks/outputs/7.csv_m8_r1.txt b/benchmarks/outputs/7.csv_m8_r1.txt new file mode 100644 index 0000000..97d89e1 --- /dev/null +++ b/benchmarks/outputs/7.csv_m8_r1.txt @@ -0,0 +1,20 @@ +gardening tools 1008561966.34 202192 +clothing 1007248356.12 200708 +home appliances 1006912573.28 200768 +furniture 1006544014.81 200052 +music instruments 1004189054.63 200433 +video games 1004028296.39 201249 +baby products 1003726034.63 201271 +toys 1003067588.60 200294 +beauty products 1001471966.40 200872 +groceries 1001100730.31 201067 +electronics 1000455104.06 200994 +automotive 997153657.20 199714 +office equipment 996422905.49 198678 +sports equipment 995965801.22 199227 
+books 995615759.95 199461 +health & wellness 994228655.01 198760 +pet supplies 993607814.43 199156 +stationery 988799500.22 197875 +jewelry 985275668.40 197660 +footwear 980870712.65 197210 diff --git a/benchmarks/outputs/all_m1_r1.txt b/benchmarks/outputs/all_m1_r1.txt new file mode 100644 index 0000000..645c483 --- /dev/null +++ b/benchmarks/outputs/all_m1_r1.txt @@ -0,0 +1,20 @@ +clothing 4560302171.99 911487 +video games 4560108307.50 913326 +baby products 4541435362.25 907186 +beauty products 4533874327.85 906417 +gardening tools 4531880837.74 905841 +automotive 4529861310.74 904962 +music instruments 4512294466.14 902389 +furniture 4503986763.16 900244 +electronics 4497526631.04 903266 +pet supplies 4488741730.38 896724 +stationery 4481794912.39 898265 +home appliances 4473888361.73 895815 +sports equipment 4469387812.34 894287 +groceries 4466915230.97 895470 +footwear 4465574983.36 894424 +jewelry 4463823670.79 893980 +office equipment 4463564947.38 892370 +toys 4462453654.12 892741 +books 4457620825.95 890948 +health & wellness 4454082892.49 890475 diff --git a/benchmarks/outputs/all_m2_r1.txt b/benchmarks/outputs/all_m2_r1.txt new file mode 100644 index 0000000..dc840dc --- /dev/null +++ b/benchmarks/outputs/all_m2_r1.txt @@ -0,0 +1,20 @@ +clothing 4560302171.99 911487 +video games 4560108307.50 913326 +baby products 4541435362.25 907186 +beauty products 4533874327.85 906417 +gardening tools 4531880837.74 905841 +automotive 4529861310.74 904962 +music instruments 4512294466.14 902389 +furniture 4503986763.16 900244 +electronics 4497526631.04 903266 +pet supplies 4488741730.38 896724 +stationery 4481794912.40 898265 +home appliances 4473888361.73 895815 +sports equipment 4469387812.34 894287 +groceries 4466915230.97 895470 +footwear 4465574983.36 894424 +jewelry 4463823670.79 893980 +office equipment 4463564947.38 892370 +toys 4462453654.12 892741 +books 4457620825.95 890948 +health & wellness 4454082892.49 890475 diff --git 
a/benchmarks/outputs/all_m4_r1.txt b/benchmarks/outputs/all_m4_r1.txt new file mode 100644 index 0000000..645c483 --- /dev/null +++ b/benchmarks/outputs/all_m4_r1.txt @@ -0,0 +1,20 @@ +clothing 4560302171.99 911487 +video games 4560108307.50 913326 +baby products 4541435362.25 907186 +beauty products 4533874327.85 906417 +gardening tools 4531880837.74 905841 +automotive 4529861310.74 904962 +music instruments 4512294466.14 902389 +furniture 4503986763.16 900244 +electronics 4497526631.04 903266 +pet supplies 4488741730.38 896724 +stationery 4481794912.39 898265 +home appliances 4473888361.73 895815 +sports equipment 4469387812.34 894287 +groceries 4466915230.97 895470 +footwear 4465574983.36 894424 +jewelry 4463823670.79 893980 +office equipment 4463564947.38 892370 +toys 4462453654.12 892741 +books 4457620825.95 890948 +health & wellness 4454082892.49 890475 diff --git a/benchmarks/outputs/all_m8_r1.txt b/benchmarks/outputs/all_m8_r1.txt new file mode 100644 index 0000000..645c483 --- /dev/null +++ b/benchmarks/outputs/all_m8_r1.txt @@ -0,0 +1,20 @@ +clothing 4560302171.99 911487 +video games 4560108307.50 913326 +baby products 4541435362.25 907186 +beauty products 4533874327.85 906417 +gardening tools 4531880837.74 905841 +automotive 4529861310.74 904962 +music instruments 4512294466.14 902389 +furniture 4503986763.16 900244 +electronics 4497526631.04 903266 +pet supplies 4488741730.38 896724 +stationery 4481794912.39 898265 +home appliances 4473888361.73 895815 +sports equipment 4469387812.34 894287 +groceries 4466915230.97 895470 +footwear 4465574983.36 894424 +jewelry 4463823670.79 893980 +office equipment 4463564947.38 892370 +toys 4462453654.12 892741 +books 4457620825.95 890948 +health & wellness 4454082892.49 890475 diff --git a/benchmarks/plots/0.csv.png b/benchmarks/plots/0.csv.png new file mode 100644 index 0000000..aed065a Binary files /dev/null and b/benchmarks/plots/0.csv.png differ diff --git a/benchmarks/plots/1.csv.png b/benchmarks/plots/1.csv.png 
new file mode 100644 index 0000000..5aefe3a Binary files /dev/null and b/benchmarks/plots/1.csv.png differ diff --git a/benchmarks/plots/2.csv.png b/benchmarks/plots/2.csv.png new file mode 100644 index 0000000..30eee88 Binary files /dev/null and b/benchmarks/plots/2.csv.png differ diff --git a/benchmarks/plots/3.csv.png b/benchmarks/plots/3.csv.png new file mode 100644 index 0000000..aae9d8a Binary files /dev/null and b/benchmarks/plots/3.csv.png differ diff --git a/benchmarks/plots/4.csv.png b/benchmarks/plots/4.csv.png new file mode 100644 index 0000000..d295377 Binary files /dev/null and b/benchmarks/plots/4.csv.png differ diff --git a/benchmarks/plots/5.csv.png b/benchmarks/plots/5.csv.png new file mode 100644 index 0000000..c695255 Binary files /dev/null and b/benchmarks/plots/5.csv.png differ diff --git a/benchmarks/plots/6.csv.png b/benchmarks/plots/6.csv.png new file mode 100644 index 0000000..ddfe486 Binary files /dev/null and b/benchmarks/plots/6.csv.png differ diff --git a/benchmarks/plots/7.csv.png b/benchmarks/plots/7.csv.png new file mode 100644 index 0000000..d3d9ca0 Binary files /dev/null and b/benchmarks/plots/7.csv.png differ diff --git a/benchmarks/plots/all.png b/benchmarks/plots/all.png new file mode 100644 index 0000000..613a0c0 Binary files /dev/null and b/benchmarks/plots/all.png differ diff --git a/benchmarks/results.csv b/benchmarks/results.csv new file mode 100644 index 0000000..a760871 --- /dev/null +++ b/benchmarks/results.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea379c8bd6c6ac9b223af3e992c85c349a4e1c7e2870c141e1373c358616f82d +size 66 diff --git a/benchmarks/results_all.csv b/benchmarks/results_all.csv new file mode 100644 index 0000000..d9e4241 --- /dev/null +++ b/benchmarks/results_all.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e95bb56de55fdda533cf373c13bebf17a8919fefe46dd13b98285e9783fa7ea +size 2313 diff --git a/build.gradle.kts b/build.gradle.kts new file mode 
100644
index 0000000..464472e
--- /dev/null
+++ b/build.gradle.kts
@@ -0,0 +1,50 @@
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+plugins {
+    kotlin("jvm") version "1.9.25"
+    id("com.github.johnrengelman.shadow") version "8.1.1"
+    application
+}
+
+group = "org.itmo.lab3"
+version = "0.1.0"
+
+repositories {
+    mavenCentral()
+}
+
+dependencies {
+    implementation(kotlin("stdlib"))
+    implementation("org.apache.hadoop:hadoop-client:3.3.6")
+    implementation("org.slf4j:slf4j-simple:2.0.13")
+    testImplementation(kotlin("test"))
+}
+
+// Compile with a Java 8 toolchain and emit Java 8 bytecode from both compilers.
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(8))
+    }
+}
+
+tasks.withType<KotlinCompile>().configureEach {
+    kotlinOptions.jvmTarget = "1.8"
+}
+
+tasks.withType<JavaCompile>().configureEach {
+    sourceCompatibility = "1.8"
+    targetCompatibility = "1.8"
+}
+
+tasks.test {
+    useJUnitPlatform()
+}
+
+// The shadow plugin builds the bundled jar with the "all" classifier.
+tasks.shadowJar {
+    archiveClassifier.set("all")
+}
+
+application {
+    mainClass.set("org.itmo.lab3.SalesByCategoryJob")
+}
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..80c7694
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,22 @@
+version: "3.7"
+
+services:
+  yarn:
+    image: ${YARN_IMAGE:-mtsrus/hadoop:hadoop2-yarn}
+    restart: "no" # quoted: a bare `no` is a YAML 1.1 boolean, not the string Compose expects
+    container_name: yarn
+    hostname: yarn
+    environment:
+      WITH_JOBHISTORY_SERVER: "true"
+    ports:
+      - "9820:9820" # HDFS IPC
+      - "9870:9870" # HDFS WebHDFS
+      - "8088:8088" # Yarn UI
+      - "8042:8042" # NodeManager UI
+      - "19888:19888" # JobHistory UI
+    volumes:
+      - ./conf/hadoop/:/var/hadoop/conf/
+      - hadoop-data:/var/hadoop/data/
+
+volumes:
+  hadoop-data:
diff --git a/scripts/benchmark_all.sh b/scripts/benchmark_all.sh
new file mode 100755
index 0000000..ad61a6f
--- /dev/null
+++ b/scripts/benchmark_all.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Benchmarks the MapReduce job inside the `yarn` container: for every dataset
+# group and every (map threads, reducers) combination it runs the job and
+# appends the wall-clock duration to a CSV file.
+# Every `${VAR:-default}` setting below can be overridden from the environment.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Format: "<label>=<file>[,<file>...]" entries separated by ';'.
+FILE_GROUPS="${FILE_GROUPS:-0.csv=0.csv;1.csv=1.csv;2.csv=2.csv;3.csv=3.csv;4.csv=4.csv;5.csv=5.csv;6.csv=6.csv;7.csv=7.csv;all=0.csv,1.csv,2.csv,3.csv,4.csv,5.csv,6.csv,7.csv}"
+
+INPUT_BASE="${INPUT_BASE:-/data/input-groups}"
+OUTPUT_BASE="${OUTPUT_BASE:-/data/output-bench-groups}"
+JAR_PATH="${JAR_PATH:-/tmp/app.jar}"
+MAP_THREADS="${MAP_THREADS:-1 2 4 8}"
+REDUCERS="${REDUCERS:-1 2 4 8}"
+RESULTS_FILE="${RESULTS_FILE:-benchmarks/results_all.csv}"
+OUTPUT_LOCAL_DIR="${OUTPUT_LOCAL_DIR:-benchmarks/outputs}"
+
+mkdir -p "${ROOT_DIR}/benchmarks"
+mkdir -p "${ROOT_DIR}/${OUTPUT_LOCAL_DIR}"
+echo "dataset,map_threads,reducers,duration_ms" > "${ROOT_DIR}/${RESULTS_FILE}"
+
+IFS=';' read -r -a GROUP_ENTRIES <<< "${FILE_GROUPS}"
+
+for entry in "${GROUP_ENTRIES[@]}"; do
+  label="${entry%%=*}"
+  files_raw="${entry#*=}"
+  IFS=',' read -r -a files <<< "${files_raw}"
+
+  INPUT_PATH="${INPUT_BASE}/${label}"
+  echo "Uploading group '${label}' -> ${INPUT_PATH}"
+  docker exec yarn hdfs dfs -rm -r -f "${INPUT_PATH}" >/dev/null 2>&1 || true
+  docker exec yarn hdfs dfs -mkdir -p "${INPUT_PATH}"
+
+  for file in "${files[@]}"; do
+    host_path="${ROOT_DIR}/${file}"
+    if [[ ! -f "${host_path}" ]]; then
+      echo "WARN: ${host_path} not found, skipping."
+      continue
+    fi
+    echo " -> ${file}"
+    docker cp "${host_path}" yarn:/tmp/"${file}"
+    docker exec yarn hdfs dfs -put -f /tmp/"${file}" "${INPUT_PATH}"/
+  done
+
+  for m in ${MAP_THREADS}; do
+    for r in ${REDUCERS}; do
+      OUT_PATH="${OUTPUT_BASE}-${label}-m${m}-r${r}"
+      echo "Running dataset=${label} m=${m} r=${r} -> ${OUT_PATH}"
+      docker exec yarn hdfs dfs -rm -r -f "${OUT_PATH}" >/dev/null 2>&1 || true
+      # NOTE: %N (nanoseconds) is a GNU date extension; BSD/macOS date does not support it.
+      start_ms=$(( $(date +%s%N) / 1000000 ))
+      docker exec yarn hadoop jar "${JAR_PATH}" \
+        --input "${INPUT_PATH}" \
+        --output "${OUT_PATH}" \
+        --map-threads "${m}" \
+        --reducers "${r}" >/dev/null
+      end_ms=$(( $(date +%s%N) / 1000000 ))
+      duration_ms=$((end_ms - start_ms))
+      echo "${label},${m},${r},${duration_ms}" | tee -a "${ROOT_DIR}/${RESULTS_FILE}"
+
+      # Only single-reducer runs are copied to the host (their part-r-00000 file).
+      if [[ "${r}" -eq 1 ]]; then
+        docker exec yarn hdfs dfs -cat "${OUT_PATH}/part-r-00000" > "${ROOT_DIR}/${OUTPUT_LOCAL_DIR}/${label}_m${m}_r${r}.txt"
+      fi
+    done
+  done
+done
+
+
+
diff --git a/settings.gradle.kts b/settings.gradle.kts
new file mode 100644
index 0000000..e69de29
diff --git a/src/main/kotlin/org/itmo/lab3/CategoryRevenueMapper.kt b/src/main/kotlin/org/itmo/lab3/CategoryRevenueMapper.kt
new file mode 100644
index 0000000..a923ab9
--- /dev/null
+++ b/src/main/kotlin/org/itmo/lab3/CategoryRevenueMapper.kt
@@ -0,0 +1,33 @@
+package org.itmo.lab3
+
+import org.apache.hadoop.io.LongWritable
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.mapreduce.Mapper
+
+/**
+ * Turns one CSV line into a `(category, stats)` pair, where the stats carry
+ * `price * quantity` as revenue together with the quantity itself.
+ *
+ * Columns are read by position: index 2 is the category, 3 the price and 4 the
+ * quantity. Blank lines, the `transaction_id` header, rows with fewer than five
+ * columns and rows whose price or quantity does not parse are skipped silently.
+ */
+class CategoryRevenueMapper : Mapper<LongWritable, Text, Text, CategoryStatsWritable>() {
+    private val reusableKey = Text()
+
+    override fun map(key: LongWritable, value: Text, context: Context) {
+        val line = value.toString().trim()
+        if (line.isEmpty() || line.startsWith("transaction_id", ignoreCase = true)) return
+
+        val parts = line.split(',')
+        if (parts.size < 5) return
+
+        val category = parts[2].trim()
+        // Trim the numeric fields as well: toLongOrNull() rejects surrounding whitespace.
+        val price = parts[3].trim().toDoubleOrNull() ?: return
+        val quantity = parts[4].trim().toLongOrNull() ?: return
+
+        reusableKey.set(category)
+        context.write(reusableKey, CategoryStatsWritable(price * quantity, quantity))
+    }
+}
diff --git a/src/main/kotlin/org/itmo/lab3/CategoryRevenueReducer.kt b/src/main/kotlin/org/itmo/lab3/CategoryRevenueReducer.kt
new file mode 100644
index 0000000..aa050fd
--- /dev/null
+++ b/src/main/kotlin/org/itmo/lab3/CategoryRevenueReducer.kt
@@ -0,0 +1,44 @@
+package org.itmo.lab3
+
+import java.util.Locale
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.mapreduce.Reducer
+
+/**
+ * Sums revenue and quantity per category and writes the totals in [cleanup],
+ * ordered by revenue (descending) and then by category name.
+ *
+ * The ordering is applied to the categories seen by this reducer instance only.
+ */
+class CategoryRevenueReducer : Reducer<Text, CategoryStatsWritable, Text, Text>() {
+    private val aggregated = mutableMapOf<String, CategoryStatsWritable>()
+    private val outValue = Text()
+
+    override fun reduce(key: Text, values: Iterable<CategoryStatsWritable>, context: Context) {
+        // Accumulate both totals in ONE pass. Hadoop's reduce-side Iterable is
+        // backed by a single forward-only iterator, so a second traversal (as two
+        // separate sumOf calls would do) sees no values and yields 0.
+        var revenue = 0.0
+        var quantity = 0L
+        for (stats in values) {
+            revenue += stats.revenue
+            quantity += stats.quantity
+        }
+
+        aggregated[key.toString()] = CategoryStatsWritable(revenue, quantity)
+    }
+
+    override fun cleanup(context: Context) {
+        val sorted = aggregated.entries.sortedWith(
+            compareByDescending<Map.Entry<String, CategoryStatsWritable>> { it.value.revenue }
+                .thenBy { it.key },
+        )
+
+        for ((category, stats) in sorted) {
+            val revenueStr = String.format(Locale.US, "%.2f", stats.revenue)
+            // Output value: "<revenue with 2 decimals>\t<quantity>".
+            outValue.set("$revenueStr\t${stats.quantity}")
+            context.write(Text(category), outValue)
+        }
+    }
+}
diff --git a/src/main/kotlin/org/itmo/lab3/CategoryStatsWritable.kt b/src/main/kotlin/org/itmo/lab3/CategoryStatsWritable.kt
new file mode 100644
index 0000000..e1093e1
--- /dev/null
+++ b/src/main/kotlin/org/itmo/lab3/CategoryStatsWritable.kt
@@ -0,0 +1,32 @@
+package org.itmo.lab3
+
+import java.io.DataInput
+import java.io.DataOutput
+import org.apache.hadoop.io.Writable
+
+/**
+ * Map-output value holding a revenue amount and an item quantity.
+ *
+ * Serialized as a double followed by a long; [readFields] reads them back in
+ * the same order. Both constructor parameters keep default values so that the
+ * Kotlin compiler also generates the parameterless constructor — presumably
+ * required by Hadoop to instantiate the value reflectively (confirm before
+ * removing the defaults).
+ */
+class CategoryStatsWritable(
+    var revenue: Double = 0.0,
+    var quantity: Long = 0L,
+) : Writable {
+
+    override fun write(out: DataOutput) {
+        out.writeDouble(revenue)
+        out.writeLong(quantity)
+    }
+
+    override fun readFields(input: DataInput) {
+        revenue = input.readDouble()
+        quantity = input.readLong()
+    }
+}
+
+
diff --git a/src/main/kotlin/org/itmo/lab3/CliArgs.kt b/src/main/kotlin/org/itmo/lab3/CliArgs.kt
new file mode 100644
index 0000000..65d9371
--- /dev/null
+++ b/src/main/kotlin/org/itmo/lab3/CliArgs.kt
@@ -0,0 +1,18 @@
+package org.itmo.lab3
+
+/**
+ * Parsed command-line options of the job.
+ *
+ * @property inputPath path the job reads its input from
+ * @property outputPath path the job writes its output to
+ * @property mapThreads thread count handed to the multithreaded mapper
+ * @property reducers number of reduce tasks
+ */
+data class CliArgs(
+    val inputPath: String,
+    val outputPath: String,
+    val mapThreads: Int = 4,
+    val reducers: Int = 1,
+)
+
+
diff --git a/src/main/kotlin/org/itmo/lab3/SalesByCategoryJob.kt b/src/main/kotlin/org/itmo/lab3/SalesByCategoryJob.kt
new file mode 100644
index 0000000..f40cb21
--- /dev/null
+++ b/src/main/kotlin/org/itmo/lab3/SalesByCategoryJob.kt
@@ -0,0 +1,110 @@
+package org.itmo.lab3
+
+import kotlin.system.exitProcess
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.mapreduce.Job
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
+
+/** Command-line entry point of the job that totals revenue and quantity per category. */
+object SalesByCategoryJob {
+    /**
+     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
+     *
+     * Recognized options: `--input`, `--output`, `--map-threads`, `--reducers`.
+     * An existing output path is deleted before the job starts.
+     */
+    @JvmStatic
+    fun main(rawArgs: Array<String>) {
+        val args = parseArgs(rawArgs)
+        val configuration = Configuration()
+        // NOTE(review): "foobar" looks like a placeholder job name.
+        val job = Job.getInstance(configuration, "foobar")
+
+        job.setJarByClass(SalesByCategoryJob::class.java)
+
+        // MultithreadedMapper is the registered mapper; CategoryRevenueMapper is the
+        // class it runs, with the requested number of threads.
+        job.mapperClass = MultithreadedMapper::class.java
+        MultithreadedMapper.setMapperClass(job, CategoryRevenueMapper::class.java)
+        MultithreadedMapper.setNumberOfThreads(job, args.mapThreads)
+
+        job.reducerClass = CategoryRevenueReducer::class.java
+        job.numReduceTasks = args.reducers
+
+        job.mapOutputKeyClass = Text::class.java
+        job.mapOutputValueClass = CategoryStatsWritable::class.java
+
+        job.outputKeyClass = Text::class.java
+        job.outputValueClass = Text::class.java
+
+        job.inputFormatClass = TextInputFormat::class.java
+        job.outputFormatClass = TextOutputFormat::class.java
+
+        FileInputFormat.addInputPath(job, Path(args.inputPath))
+        val outputPath = Path(args.outputPath)
+        // Resolve the file system from the path and do NOT close it: Hadoop hands out
+        // cached, shared FileSystem instances, so closing one here would also close it
+        // for every other holder of that instance.
+        val fs: FileSystem = outputPath.getFileSystem(configuration)
+        if (fs.exists(outputPath)) {
+            fs.delete(outputPath, true)
+        }
+        FileOutputFormat.setOutputPath(job, outputPath)
+
+        // Propagate the job result so callers such as shell scripts can detect a failure.
+        val succeeded = job.waitForCompletion(true)
+        exitProcess(if (succeeded) 0 else 1)
+    }
+
+    /**
+     * Parses `--input`, `--output`, `--map-threads` and `--reducers`.
+     *
+     * A missing or non-numeric value for a numeric option keeps its default
+     * (4 map threads, 1 reducer); values below 1 are raised to 1.
+     *
+     * @throws IllegalArgumentException on an unknown option, or when `--input` or
+     * `--output` is missing or blank
+     */
+    private fun parseArgs(args: Array<String>): CliArgs {
+        var inputPath: String? = null
+        var outputPath: String? = null
+        var mapThreads = 4
+        var reducers = 1
+
+        var index = 0
+        while (index < args.size) {
+            when (args[index]) {
+                "--input" -> inputPath = args.getOrNull(++index)
+                "--output" -> outputPath = args.getOrNull(++index)
+                "--map-threads" -> mapThreads = args.getOrNull(++index)?.toIntOrNull() ?: mapThreads
+                "--reducers" -> reducers = args.getOrNull(++index)?.toIntOrNull() ?: reducers
+                else -> throw IllegalArgumentException("Unknown argument: ${args[index]}")
+            }
+            index++
+        }
+
+        // Fail fast with a clear message instead of handing an empty path to Hadoop.
+        val input = requireNotNull(inputPath?.takeUnless { it.isBlank() }) {
+            "Missing required argument: --input <path>"
+        }
+        val output = requireNotNull(outputPath?.takeUnless { it.isBlank() }) {
+            "Missing required argument: --output <path>"
+        }
+
+        return CliArgs(
+            inputPath = input,
+            outputPath = output,
+            mapThreads = mapThreads.coerceAtLeast(1),
+            reducers = reducers.coerceAtLeast(1),
+        )
+    }
+}
+
+