Spark performance test results on cluster
test 1 #
7416829 rows csv
- 4core local ssd(xps13)
- loading : 1m4s
- sql(groupby, sum, orderby) : 50s
- 4core nfs hdd(xps13)
- loading : 3m13s
- sql(groupby, sum, orderby) : 3m11s
- 4core(nfs hdd, xps13, master) + 2core(local hdd, n54l, slave)
- loading : 46s
- sql(groupby, sum, orderby) : 40s
test 2-1 #
- 4core (desktop)
- mysql sql : 2m12s
- 4core (desktop)
- loading csv : 50s
- sql(groupby, sum, orderby) : 44s
- 4core (desktop,master) + 4core (mal.dev, slave) + 4core (del-dev-mv, slave) : tested with the CSV file already replicated to each host
- loading csv : 21s
- sql(groupby, sum, orderby) : 24s
test 2-2 #
- 4core (desktop)
- mysql sql : 2m4s
- 4core (desktop)
- loading csv : 50s
- sql(groupby, sum, orderby) : 45s
- 4core (desktop,master) + 4core (mal.dev, slave) + 4core (del-dev-mv, slave) : tested with the CSV file already replicated to each host
- loading csv : 55s
- sql(groupby, sum, orderby) : ERR
test 2-3 #
- 4core (desktop)
- mysql sql : 2m5s
- 4core (desktop)
- loading csv : 56s
- sql(groupby, sum, orderby) : 45s
- 4core (desktop,master) + 4core (mal.dev, slave) : tested with the CSV file already replicated to each host
- loading csv : 38s
- sql(groupby, sum, orderby) : 41s
test case #
// Load the CSV with a header row, letting spark-csv infer column types.
val df = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("export.csv")

// Register the DataFrame so it is queryable from SQL as "base".
// NOTE(review): registerTempTable is deprecated as of Spark 2.0
// (createOrReplaceTempView) — kept here since tests 1-3 ran on 1.x.
df.registerTempTable("base")

// Total amount per (optionSrl, amountType), largest totals first.
val amtByOp = sqlContext.sql("select optionSrl, amountType, sum(amount) amount from base group by optionSrl, amountType order by amount desc")
amtByOp.show()
test 3 (nfs) #
- calc03
- load 56 / sql 43
- calc03(master) + calc02
- load 51 / sql ERR
test 4 (spark2.0, nfs) #
- calc03
- load 38 / sql 14
- calc03(master) + calc02 - load 27 / sql 27 with ERROR(OOM)