susanli2016 committed May 7, 2018
1 parent 06cf8d7 commit 7fc6a21
Showing 1 changed file with 275 additions and 0 deletions.
275 changes: 275 additions & 0 deletions K-Means.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|\n",
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|\n",
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+\n",
"only showing top 3 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"spark = SparkSession.builder.appName('cluster').getOrCreate()\n",
"df = spark.read.csv('seeds_dataset.csv', inferSchema=True, header=True)\n",
"df.show(3)"
]
},
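{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check that could follow the load step: per-column summary statistics (sketch only, using the standard DataFrame `describe` call)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: count, mean, stddev, min and max for every column\n",
"df.describe().show()"
]
},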
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- area: double (nullable = true)\n",
" |-- perimeter: double (nullable = true)\n",
" |-- compactness: double (nullable = true)\n",
" |-- length_of_kernel: double (nullable = true)\n",
" |-- width_of_kernel: double (nullable = true)\n",
" |-- asymmetry_coefficient: double (nullable = true)\n",
" |-- length_of_groove: double (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove| features|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|[15.26,14.84,0.87...|\n",
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|[14.88,14.57,0.88...|\n",
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|[14.29,14.09,0.90...|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+\n",
"only showing top 3 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"from pyspark.ml.clustering import KMeans\n",
"\n",
"assembler = VectorAssembler(inputCols = df.columns, outputCol = 'features')\n",
"final_df = assembler.transform(df)\n",
"final_df.show(3)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- area: double (nullable = true)\n",
" |-- perimeter: double (nullable = true)\n",
" |-- compactness: double (nullable = true)\n",
" |-- length_of_kernel: double (nullable = true)\n",
" |-- width_of_kernel: double (nullable = true)\n",
" |-- asymmetry_coefficient: double (nullable = true)\n",
" |-- length_of_groove: double (nullable = true)\n",
" |-- features: vector (nullable = true)\n",
"\n"
]
}
],
"source": [
"final_df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
"| area|perimeter|compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient|length_of_groove| features| scaledFeatures|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
"|15.26| 14.84| 0.871| 5.763| 3.312| 2.221| 5.22|[15.26,14.84,0.87...|[5.24452795332028...|\n",
"|14.88| 14.57| 0.8811|5.553999999999999| 3.333| 1.018| 4.956|[14.88,14.57,0.88...|[5.11393027165175...|\n",
"|14.29| 14.09| 0.905| 5.291|3.3369999999999997| 2.699| 4.825|[14.29,14.09,0.90...|[4.91116018695588...|\n",
"+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+\n",
"only showing top 3 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import StandardScaler\n",
"\n",
"scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')\n",
"scaler_model = scaler.fit(final_df)\n",
"final_df = scaler_model.transform(final_df)\n",
"final_df.show(3)"
]
},
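{
"cell_type": "markdown",
"metadata": {},
"source": [
"By default `StandardScaler` uses `withStd=True, withMean=False`, i.e. it divides each feature by its standard deviation without centering it. A centred variant would look like the sketch below (the `centeredScaledFeatures` column name is illustrative and is not used later in the notebook)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: centre each feature as well as scaling it (not used downstream)\n",
"centered_scaler = StandardScaler(inputCol='features', outputCol='centeredScaledFeatures',\n",
"                                 withMean=True, withStd=True)\n",
"centered_scaler.fit(final_df).transform(final_df).select('centeredScaledFeatures').show(3, truncate=False)"
]
},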
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df.take(1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"kmeans = KMeans(featuresCol = 'scaledFeatures', k=3)\n",
"model = kmeans.fit(final_df)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WSSSE: 428.60820118716356\n"
]
}
],
"source": [
"print('WSSSE:', model.computeCost(final_df))"
]
},
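{
"cell_type": "markdown",
"metadata": {},
"source": [
"`computeCost` returns the Within Set Sum of Squared Errors (WSSSE); later Spark releases replace it with `ClusteringEvaluator`. As a rough elbow-style check of the choice k=3, one could refit for a few values of k and compare the cost, as in the sketch below (`seed` is fixed only for repeatability)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (Spark 2.x API): elbow-style sweep over k using WSSSE\n",
"for k in range(2, 7):\n",
"    km_model = KMeans(featuresCol='scaledFeatures', k=k, seed=1).fit(final_df)\n",
"    print('k = {}: WSSSE = {}'.format(k, km_model.computeCost(final_df)))"
]
},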
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742, 7.54416916,\n",
" 3.15410901, 10.38031464]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446, 9.7892399 ,\n",
" 2.41585013, 12.29286107]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267, 8.62880781,\n",
" 1.80061978, 10.41913733])]\n"
]
}
],
"source": [
"centers = model.clusterCenters()\n",
"print(centers)"
]
},
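{
"cell_type": "markdown",
"metadata": {},
"source": [
"An alternative quality measure, assuming Spark 2.3 or later: the silhouette score from `pyspark.ml.evaluation.ClusteringEvaluator`, computed on the cluster assignments. The sketch below also counts how many rows fall into each cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumes Spark >= 2.3): silhouette score and cluster sizes\n",
"from pyspark.ml.evaluation import ClusteringEvaluator\n",
"\n",
"predictions = model.transform(final_df)\n",
"evaluator = ClusteringEvaluator(featuresCol='scaledFeatures', predictionCol='prediction')\n",
"print('Silhouette:', evaluator.evaluate(predictions))\n",
"predictions.groupBy('prediction').count().show()"
]
},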
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+----------+\n",
"| scaledFeatures|prediction|\n",
"+--------------------+----------+\n",
"|[5.24452795332028...| 2|\n",
"|[5.11393027165175...| 2|\n",
"|[4.91116018695588...| 2|\n",
"|[4.75650503761158...| 2|\n",
"|[5.54696468981581...| 2|\n",
"|[4.94209121682475...| 2|\n",
"|[5.04863143081749...| 2|\n",
"|[4.84929812721816...| 2|\n",
"|[5.71536696354628...| 1|\n",
"|[5.65006812271202...| 2|\n",
"|[5.24452795332028...| 2|\n",
"|[4.82180387844584...| 2|\n",
"|[4.77368894309428...| 2|\n",
"|[4.73588435103234...| 2|\n",
"|[4.72213722664617...| 2|\n",
"|[5.01426361985209...| 2|\n",
"|[4.80805675405968...| 2|\n",
"|[5.39230954047151...| 2|\n",
"|[5.05206821191403...| 2|\n",
"|[4.37158555479908...| 0|\n",
"+--------------------+----------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"model.transform(final_df).select('scaledFeatures', 'prediction').show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
