forked from susanli2016/PySpark-and-MLlib
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent: 06cf8d7 · commit: 7fc6a21
Showing 1 changed file with 275 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,275 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n", | ||
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n", | ||
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|\n", | ||
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|\n", | ||
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n", | ||
"only showing top 3 rows\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from pyspark.sql import SparkSession\n", | ||
"\n", | ||
"spark = SparkSession.builder.appName('cluster').getOrCreate()\n", | ||
"df = spark.read.csv('seeds_dataset.csv', inferSchema=True, header=True)\n", | ||
"df.show(3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"root\n", | ||
" |-- area: double (nullable = true)\n", | ||
" |-- perimeter: double (nullable = true)\n", | ||
" |-- compactness: double (nullable = true)\n", | ||
" |-- length_of_kernel: double (nullable = true)\n", | ||
" |-- width_of_kernel: double (nullable = true)\n", | ||
" |-- asymmetry_coefficient: double (nullable = true)\n", | ||
" |-- length_of_groove: double (nullable = true)\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"df.printSchema()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n", | ||
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove| features|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n", | ||
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|[15.26,14.84,0.87...|\n", | ||
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|[14.88,14.57,0.88...|\n", | ||
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|[14.29,14.09,0.90...|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n", | ||
"only showing top 3 rows\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from pyspark.ml.feature import VectorAssembler\n", | ||
"from pyspark.ml.clustering import KMeans\n", | ||
"\n", | ||
"assembler = VectorAssembler(inputCols = df.columns, outputCol = 'features')\n", | ||
"final_df = assembler.transform(df)\n", | ||
"final_df.show(3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"root\n", | ||
" |-- area: double (nullable = true)\n", | ||
" |-- perimeter: double (nullable = true)\n", | ||
" |-- compactness: double (nullable = true)\n", | ||
" |-- length_of_kernel: double (nullable = true)\n", | ||
" |-- width_of_kernel: double (nullable = true)\n", | ||
" |-- asymmetry_coefficient: double (nullable = true)\n", | ||
" |-- length_of_groove: double (nullable = true)\n", | ||
" |-- features: vector (nullable = true)\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"final_df.printSchema()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n", | ||
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove| features| scaledFeatures|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n", | ||
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|[15.26,14.84,0.87...|[5.24452795332028...|\n", | ||
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|[14.88,14.57,0.88...|[5.11393027165175...|\n", | ||
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|[14.29,14.09,0.90...|[4.91116018695588...|\n", | ||
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n", | ||
"only showing top 3 rows\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from pyspark.ml.feature import StandardScaler\n", | ||
"\n", | ||
"scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')\n", | ||
"scaler_model = scaler.fit(final_df)\n", | ||
"final_df = scaler_model.transform(final_df)\n", | ||
"final_df.show(3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"final_df.take(1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"kmeans = KMeans(featuresCol = 'scaledFeatures', k=3)\n", | ||
"model = kmeans.fit(final_df)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WSSSE: 428.60820118716356\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print('WSSSE:', model.computeCost(final_df))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742, 7.54416916,\n", | ||
" 3.15410901, 10.38031464]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446, 9.7892399 ,\n", | ||
" 2.41585013, 12.29286107]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267, 8.62880781,\n", | ||
" 1.80061978, 10.41913733])]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"centers = model.clusterCenters()\n", | ||
"print(centers)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"+--------------------+----------+\n", | ||
"| scaledFeatures|prediction|\n", | ||
"+--------------------+----------+\n", | ||
"|[5.24452795332028...| 2|\n", | ||
"|[5.11393027165175...| 2|\n", | ||
"|[4.91116018695588...| 2|\n", | ||
"|[4.75650503761158...| 2|\n", | ||
"|[5.54696468981581...| 2|\n", | ||
"|[4.94209121682475...| 2|\n", | ||
"|[5.04863143081749...| 2|\n", | ||
"|[4.84929812721816...| 2|\n", | ||
"|[5.71536696354628...| 1|\n", | ||
"|[5.65006812271202...| 2|\n", | ||
"|[5.24452795332028...| 2|\n", | ||
"|[4.82180387844584...| 2|\n", | ||
"|[4.77368894309428...| 2|\n", | ||
"|[4.73588435103234...| 2|\n", | ||
"|[4.72213722664617...| 2|\n", | ||
"|[5.01426361985209...| 2|\n", | ||
"|[4.80805675405968...| 2|\n", | ||
"|[5.39230954047151...| 2|\n", | ||
"|[5.05206821191403...| 2|\n", | ||
"|[4.37158555479908...| 0|\n", | ||
"+--------------------+----------+\n", | ||
"only showing top 20 rows\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"model.transform(final_df).select('scaledFeatures', 'prediction').show()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "conda_python3", | ||
"language": "python", | ||
"name": "conda_python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |