Skip to content

Commit

Permalink
Merge pull request #12 from zhangzhang10/xgboost-example
Browse files Browse the repository at this point in the history
Add XGBoost with Arrow optimization example notebook
  • Loading branch information
jerrychenhf committed Apr 27, 2021
2 parents d9b0a0c + 4f537be commit 8552194
Show file tree
Hide file tree
Showing 2 changed files with 738 additions and 0 deletions.
206 changes: 206 additions & 0 deletions xgboost/spylon-hibenchdata-to-parquet.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%python\n",
"import findspark\n",
"import os\n",
"\n",
"findspark.init(\"/home/ubuntu/spark-3.0.0-bin-hadoop2.7\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%init_spark\n",
"launcher.num_executors = 16\n",
"launcher.executor_cores = 24\n",
"launcher.executor_memory = '10g'\n",
"launcher.conf.set(\"spark.app.name\", \"Generate hibench parquet\")\n",
"launcher.conf.set(\"spark.authenticate\", \"false\")\n",
"launcher.conf.set(\"spark.deploy-mode\", \"client\")\n",
"launcher.conf.set(\"spark.task.cpus\", \"4\")\n",
"launcher.conf.set(\"spark.master\", \"yarn\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import org.apache.spark.rdd.RDD\n",
"import org.apache.spark.mllib.regression.LabeledPoint\n",
"import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}\n",
"import org.apache.spark.sql.functions._\n",
"import org.apache.spark.ml._\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spark.sparkContext.getConf.getAll.foreach(println)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"println(spark.sparkContext.applicationId)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%python\n",
"print(spark.sparkContext.applicationId)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sc.defaultParallelism"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"println(s\"Load and parse the data file\")\n",
"val mllibRDD: RDD[LabeledPoint] = spark.sparkContext.objectFile(\"hdfs:///HiBench/XGBoost/Input\")\n",
"// Convert to ML LabeledPoint and to DataFrame\n",
"val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) }\n",
"val data = mlRDD.toDF(\"label\", \"features\").coalesce(384)\n",
"\n",
"val vecToArray = udf( (xs: linalg.Vector) => xs.toArray )\n",
"val dfArr = data.withColumn(\"featuresArr\" , vecToArray($\"features\") )\n",
"val feats = Array(\n",
" \"f0\", \"f1\", \"f2\", \"f3\", \"f4\", \"f5\", \"f6\", \"f7\", \"f8\", \"f9\",\n",
" \"f10\", \"f11\", \"f12\", \"f13\", \"f14\", \"f15\", \"f16\", \"f17\", \"f18\", \"f19\",\n",
" \"f20\", \"f21\", \"f22\", \"f23\", \"f24\", \"f25\", \"f26\", \"f27\", \"f28\", \"f29\",\n",
" \"f30\", \"f31\", \"f32\", \"f33\", \"f34\", \"f35\", \"f36\", \"f37\", \"f38\", \"f39\",\n",
" \"f40\", \"f41\", \"f42\", \"f43\", \"f44\", \"f45\", \"f46\", \"f47\", \"f48\", \"f49\"\n",
")\n",
"val sqlExpr = feats.zipWithIndex.map{ case (alias, idx) => col(\"featuresArr\").getItem(idx).cast(\"float\").as(alias) }\n",
"val ldf = dfArr.select(sqlExpr : _*).withColumn(\"id\", monotonicallyIncreasingId)\n",
"val rdf = dfArr.select(col(\"label\")).withColumn(\"id\", monotonicallyIncreasingId)\n",
"val df = ldf.join(rdf, \"id\").drop(\"id\")\n",
"df.write.mode(\"overwrite\").parquet(\"hdfs:///HiBench600Mx50.dataframe.float.parquet\")\n",
"\n",
"df.printSchema();\n",
"println(data.count())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%python\n",
"\n",
"# exit gracefully\n",
"spark.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
532 changes: 532 additions & 0 deletions xgboost/xgboost-example.ipynb

Large diffs are not rendered by default.

0 comments on commit 8552194

Please sign in to comment.