Merge pull request #12 from zhangzhang10/xgboost-example

Add XGBoost with Arrow optimization example notebook
zwx109473 · Apr 27, 2021 · 8552194 · 8552194
2 parents d9b0a0c + 4f537be
commit 8552194
Show file tree

Hide file tree

Showing 2 changed files with 738 additions and 0 deletions.
diff --git a/xgboost/spylon-hibenchdata-to-parquet.ipynb b/xgboost/spylon-hibenchdata-to-parquet.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "import findspark\n",
+    "import os\n",
+    "\n",
+    "findspark.init(\"/home/ubuntu/spark-3.0.0-bin-hadoop2.7\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%init_spark\n",
+    "launcher.num_executors = 16\n",
+    "launcher.executor_cores = 24\n",
+    "launcher.executor_memory = '10g'\n",
+    "launcher.conf.set(\"spark.app.name\", \"Generate hibench parquet\")\n",
+    "launcher.conf.set(\"spark.authenticate\", \"false\")\n",
+    "launcher.conf.set(\"spark.deploy-mode\", \"client\")\n",
+    "launcher.conf.set(\"spark.task.cpus\", \"4\")\n",
+    "launcher.conf.set(\"spark.master\", \"yarn\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import org.apache.spark.rdd.RDD\n",
+    "import org.apache.spark.mllib.regression.LabeledPoint\n",
+    "import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}\n",
+    "import org.apache.spark.sql.functions._\n",
+    "import org.apache.spark.ml._\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sparkContext.getConf.getAll.foreach(println)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "println(spark.sparkContext.applicationId)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "print(spark.sparkContext.applicationId)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc.defaultParallelism"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "println(s\"Load and parse the data file\")\n",
+    "val mllibRDD: RDD[LabeledPoint] = spark.sparkContext.objectFile(\"hdfs:///HiBench/XGBoost/Input\")\n",
+    "// Convert to ML LabeledPoint and to DataFrame\n",
+    "val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) }\n",
+    "val data = mlRDD.toDF(\"label\", \"features\").coalesce(384)\n",
+    "\n",
+    "val vecToArray = udf( (xs: linalg.Vector) => xs.toArray )\n",
+    "val dfArr = data.withColumn(\"featuresArr\" , vecToArray($\"features\") )\n",
+    "val feats = Array(\n",
+    "    \"f0\", \"f1\", \"f2\", \"f3\", \"f4\", \"f5\", \"f6\", \"f7\", \"f8\", \"f9\",\n",
+    "    \"f10\", \"f11\", \"f12\", \"f13\", \"f14\", \"f15\", \"f16\", \"f17\", \"f18\", \"f19\",\n",
+    "    \"f20\", \"f21\", \"f22\", \"f23\", \"f24\", \"f25\", \"f26\", \"f27\", \"f28\", \"f29\",\n",
+    "    \"f30\", \"f31\", \"f32\", \"f33\", \"f34\", \"f35\", \"f36\", \"f37\", \"f38\", \"f39\",\n",
+    "    \"f40\", \"f41\", \"f42\", \"f43\", \"f44\", \"f45\", \"f46\", \"f47\", \"f48\", \"f49\"\n",
+    ")\n",
+    "val sqlExpr = feats.zipWithIndex.map{ case (alias, idx) => col(\"featuresArr\").getItem(idx).cast(\"float\").as(alias) }\n",
+    "val ldf = dfArr.select(sqlExpr : _*).withColumn(\"id\", monotonicallyIncreasingId)\n",
+    "val rdf = dfArr.select(col(\"label\")).withColumn(\"id\", monotonicallyIncreasingId)\n",
+    "val df = ldf.join(rdf, \"id\").drop(\"id\")\n",
+    "df.write.mode(\"overwrite\").parquet(\"hdfs:///HiBench600Mx50.dataframe.float.parquet\")\n",
+    "\n",
+    "df.printSchema();\n",
+    "println(data.count())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "# exit gracefully\n",
+    "spark.stop()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "spylon-kernel",
+   "language": "scala",
+   "name": "spylon-kernel"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-scala",
+   "file_extension": ".scala",
+   "help_links": [
+    {
+     "text": "MetaKernel Magics",
+     "url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
+    }
+   ],
+   "mimetype": "text/x-scala",
+   "name": "scala",
+   "pygments_lexer": "scala",
+   "version": "0.4.1"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/xgboost/xgboost-example.ipynb b/xgboost/xgboost-example.ipynb