Automation test for spark job with managed vnet and interactive session notebook #2436

Merged (20 commits, Jul 17, 2023)
.github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml
@@ -0,0 +1,80 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: sdk-jobs-spark-automation-run_interactive_session_notebook
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:
  workflow_dispatch:
  schedule:
    - cron: "30 11/12 * * *"
  pull_request:
    branches:
      - main
    paths:
      - sdk/python/jobs/spark/automation/**
      - .github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: check out repo
      uses: actions/checkout@v2
    - name: setup python
      uses: actions/setup-python@v2
      with:
        python-version: "3.8"
    - name: pip install notebook reqs
      run: pip install -r sdk/python/dev-requirements.txt
    - name: azure login
      uses: azure/login@v1
      with:
        creds: ${{secrets.AZUREML_CREDENTIALS}}
    - name: bootstrap resources
      run: |
          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
          bash bootstrap.sh
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup SDK
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: sdk/python
      continue-on-error: true
    - name: setup-cli
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: cli
      continue-on-error: true
    - name: setup spark resources
      run: |
          bash -x jobs/spark/setup_spark.sh jobs/spark/ jobs/spark/automation/run_interactive_session_notebook.ipynb
      working-directory: sdk/python
      continue-on-error: true
    - name: run jobs/spark/automation/run_interactive_session_notebook.ipynb
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "run_interactive_session_notebook.ipynb";
          [ -f "../../.azureml/config" ] && cat "../../.azureml/config";
          papermill -k python run_interactive_session_notebook.ipynb run_interactive_session_notebook.output.ipynb
      working-directory: sdk/python/jobs/spark/automation
    - name: upload notebook's working folder as an artifact
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_interactive_session_notebook
        path: sdk/python/jobs/spark/automation
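Outside CI, the papermill step above can be reproduced locally. This is a minimal sketch, assuming `papermill` and the packages in `sdk/python/dev-requirements.txt` are installed, the notebook's template values have already been replaced, and `.azureml/config.json` has been generated for the target workspace:

```python
# Local equivalent of the "run ... run_interactive_session_notebook.ipynb" step above.
# Assumes the current working directory is sdk/python/jobs/spark/automation.
import papermill as pm

pm.execute_notebook(
    "run_interactive_session_notebook.ipynb",         # input notebook
    "run_interactive_session_notebook.output.ipynb",  # executed copy, uploaded as the CI artifact
    kernel_name="python",                             # mirrors `papermill -k python`
)
```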
.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml
@@ -0,0 +1,80 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:
  workflow_dispatch:
  schedule:
    - cron: "14 1/12 * * *"
  pull_request:
    branches:
      - main
    paths:
      - sdk/python/jobs/spark/**
      - .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: check out repo
      uses: actions/checkout@v2
    - name: setup python
      uses: actions/setup-python@v2
      with:
        python-version: "3.8"
    - name: pip install notebook reqs
      run: pip install -r sdk/python/dev-requirements.txt
    - name: azure login
      uses: azure/login@v1
      with:
        creds: ${{secrets.AZUREML_CREDENTIALS}}
    - name: bootstrap resources
      run: |
          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
          bash bootstrap.sh
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup SDK
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: sdk/python
      continue-on-error: true
    - name: setup-cli
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: cli
      continue-on-error: true
    - name: setup spark resources
      run: |
          bash -x jobs/spark/setup_spark.sh jobs/spark/ jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb
      working-directory: sdk/python
      continue-on-error: true
    - name: run jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb
      run: |
          source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
          source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "submit_spark_standalone_jobs_managed_vnet.ipynb";
          [ -f "../../.azureml/config" ] && cat "../../.azureml/config";
          papermill -k python submit_spark_standalone_jobs_managed_vnet.ipynb submit_spark_standalone_jobs_managed_vnet.output.ipynb
      working-directory: sdk/python/jobs/spark
    - name: upload notebook's working folder as an artifact
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: submit_spark_standalone_jobs_managed_vnet
        path: sdk/python/jobs/spark
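For context, the workflow above runs a notebook that submits a standalone Spark job to a workspace configured with a managed virtual network. The sketch below shows roughly what such a submission looks like with the `azure-ai-ml` SDK; it is illustrative only, and the display name, entry script, resource sizes, and data path are assumptions rather than values taken from this PR:

```python
# Illustrative only: a standalone (serverless) Spark job submission with azure-ai-ml.
# Names, paths, and resource sizes are placeholders, not values from this PR.
from azure.ai.ml import MLClient, spark, Input
from azure.identity import DefaultAzureCredential

# Reads the workspace details from .azureml/config.json, as generated in the workflow.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

job = spark(
    display_name="titanic-data-wrangling",      # hypothetical name
    code="./src",                                # folder containing the entry script
    entry={"file": "titanic.py"},                # hypothetical entry script
    driver_cores=1,
    driver_memory="2g",
    executor_cores=2,
    executor_memory="2g",
    executor_instances=2,
    resources={"instance_type": "standard_e4s_v3", "runtime_version": "3.3"},
    inputs={
        "titanic_data": Input(
            type="uri_file",
            path="abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/titanic.csv",
            mode="direct",
        )
    },
    args="--titanic_data ${{inputs.titanic_data}}",
)

submitted_job = ml_client.jobs.create_or_update(job)
print(submitted_job.studio_url)
```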
70 changes: 49 additions & 21 deletions sdk/python/data-wrangling/interactive_data_wrangling.ipynb
@@ -49,11 +49,15 @@
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"key_vault_name = \"<KEY_VAULT_NAME>\"\n",
"access_key_secret_name = \"<ACCESS_KEY_SECRET_NAME>\"\n",
"storage_account_name = \"<STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"sc = SparkSession.builder.getOrCreate()\n",
"token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n",
"access_key = token_library.getSecret(\"<KEY_VAULT_NAME>\", \"<ACCESS_KEY_SECRET_NAME>\")\n",
"access_key = token_library.getSecret(key_vault_name, access_key_secret_name)\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.key.<STORAGE_ACCOUNT_NAME>.blob.core.windows.net\", access_key\n",
" f\"fs.azure.account.key.{storage_account_name}.blob.core.windows.net\", access_key\n",
")"
]
},
@@ -84,8 +88,11 @@
"import pyspark.pandas as pd\n",
"from pyspark.ml.feature import Imputer\n",
"\n",
"blob_container_name = \"<BLOB_CONTAINER_NAME>\"\n",
"storage_account_name = \"<STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"df = pd.read_csv(\n",
" \"wasbs://<BLOB_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/data/titanic.csv\",\n",
" f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv\",\n",
" index_col=\"PassengerId\",\n",
")\n",
"imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n",
@@ -96,7 +103,7 @@
") # Fill Cabin column with value \"None\" if missing\n",
"df.dropna(inplace=True) # Drop the rows which still have any missing value\n",
"df.to_csv(\n",
" \"wasbs://<BLOB_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/data/wrangled\",\n",
" f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled\",\n",
" index_col=\"PassengerId\",\n",
")"
]
@@ -141,11 +148,16 @@
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"key_vault_name = \"<KEY_VAULT_NAME>\"\n",
"sas_token_secret_name = \"<SAS_TOKEN_SECRET_NAME>\"\n",
"blob_container_name = \"<BLOB_CONTAINER_NAME>\"\n",
"storage_account_name = \"<STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"sc = SparkSession.builder.getOrCreate()\n",
"token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n",
"sas_token = token_library.getSecret(\"<KEY_VAULT_NAME>\", \"<SAS_TOKEN_SECRET_NAME>\")\n",
"sas_token = token_library.getSecret(key_vault_name, sas_token_secret_name)\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.sas.<BLOB_CONTAINER_NAME>.<STORAGE_ACCOUNT_NAME>.blob.core.windows.net\",\n",
" f\"fs.azure.sas.{blob_container_name}.{storage_account_name}.blob.core.windows.net\",\n",
" sas_token,\n",
")"
]
@@ -177,8 +189,11 @@
"import pyspark.pandas as pd\n",
"from pyspark.ml.feature import Imputer\n",
"\n",
"blob_container_name = \"<BLOB_CONTAINER_NAME>\"\n",
"storage_account_name = \"<STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"df = pd.read_csv(\n",
" \"wasbs://<BLOB_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/data/titanic.csv\",\n",
" f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv\",\n",
" index_col=\"PassengerId\",\n",
")\n",
"imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n",
@@ -189,7 +204,7 @@
") # Fill Cabin column with value \"None\" if missing\n",
"df.dropna(inplace=True) # Drop the rows which still have any missing value\n",
"df.to_csv(\n",
" \"wasbs://<BLOB_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/data/wrangled\",\n",
" f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled\",\n",
" index_col=\"PassengerId\",\n",
")"
]
@@ -236,8 +251,11 @@
"import pyspark.pandas as pd\n",
"from pyspark.ml.feature import Imputer\n",
"\n",
"file_system_name = \"<FILE_SYSTEM_NAME>\"\n",
"gen2_storage_account_name = \"<GEN2_STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"df = pd.read_csv(\n",
" \"abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/titanic.csv\",\n",
" f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/titanic.csv\",\n",
" index_col=\"PassengerId\",\n",
")\n",
"imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n",
@@ -248,7 +266,7 @@
") # Fill Cabin column with value \"None\" if missing\n",
"df.dropna(inplace=True) # Drop the rows which still have any missing value\n",
"df.to_csv(\n",
" \"abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/wrangled\",\n",
" f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/wrangled\",\n",
" index_col=\"PassengerId\",\n",
")"
]
@@ -272,7 +290,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"- To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the user identity.\n",
"- To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the Service Principal.\n",
"- Set configuration properties as follows:\n",
" - Client ID property: `fs.azure.account.oauth2.client.id.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net`\n",
" - Client secret property: `fs.azure.account.oauth2.client.secret.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net`\n",
Expand All @@ -298,32 +316,39 @@
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"key_vault_name = \"<KEY_VAULT_NAME>\"\n",
"client_id_secret_name = \"<CLIENT_ID_SECRET_NAME>\"\n",
"tenant_id_secret_name = \"<TENANT_ID_SECRET_NAME>\"\n",
"client_secret_name = \"<CLIENT_SECRET_NAME>\"\n",
"gen2_storage_account_name = \"<GEN2_STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"sc = SparkSession.builder.getOrCreate()\n",
"token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n",
"\n",
"# Set up service principal tenant ID, client ID and secret from Azure Key Vault\n",
"client_id = token_library.getSecret(\"<KEY_VAULT_NAME>\", \"<CLIENT_ID_SECRET_NAME>\")\n",
"tenant_id = token_library.getSecret(\"<KEY_VAULT_NAME>\", \"<TENANT_ID_SECRET_NAME>\")\n",
"client_secret = token_library.getSecret(\"<KEY_VAULT_NAME>\", \"<CLIENT_SECRET_NAME>\")\n",
"client_id = token_library.getSecret(key_vault_name, client_id_secret_name)\n",
"tenant_id = token_library.getSecret(key_vault_name, tenant_id_secret_name)\n",
"client_secret = token_library.getSecret(key_vault_name, client_secret_name)\n",
"\n",
"# Set up service principal which has access of the data\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.auth.type.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net\", \"OAuth\"\n",
" f\"fs.azure.account.auth.type.{gen2_storage_account_name}.dfs.core.windows.net\",\n",
" \"OAuth\",\n",
")\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.oauth.provider.type.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net\",\n",
" f\"fs.azure.account.oauth.provider.type.{gen2_storage_account_name}.dfs.core.windows.net\",\n",
" \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\",\n",
")\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.oauth2.client.id.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net\",\n",
" f\"fs.azure.account.oauth2.client.id.{gen2_storage_account_name}.dfs.core.windows.net\",\n",
" client_id,\n",
")\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.oauth2.client.secret.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net\",\n",
" f\"fs.azure.account.oauth2.client.secret.{gen2_storage_account_name}.dfs.core.windows.net\",\n",
" client_secret,\n",
")\n",
"sc._jsc.hadoopConfiguration().set(\n",
" \"fs.azure.account.oauth2.client.endpoint.<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net\",\n",
" f\"fs.azure.account.oauth2.client.endpoint.{gen2_storage_account_name}.dfs.core.windows.net\",\n",
" \"https://login.microsoftonline.com/\" + tenant_id + \"/oauth2/token\",\n",
")"
]
@@ -355,8 +380,11 @@
"import pyspark.pandas as pd\n",
"from pyspark.ml.feature import Imputer\n",
"\n",
"file_system_name = \"<FILE_SYSTEM_NAME>\"\n",
"gen2_storage_account_name = \"<GEN2_STORAGE_ACCOUNT_NAME>\"\n",
"\n",
"df = pd.read_csv(\n",
" \"abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/titanic.csv\",\n",
" f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/titanic.csv\",\n",
" index_col=\"PassengerId\",\n",
")\n",
"imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n",
@@ -367,7 +395,7 @@
") # Fill Cabin column with value \"None\" if missing\n",
"df.dropna(inplace=True) # Drop the rows which still have any missing value\n",
"df.to_csv(\n",
" \"abfss://<FILE_SYSTEM_NAME>@<STORAGE_ACCOUNT_NAME>.dfs.core.windows.net/data/wrangled\",\n",
" f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/wrangled\",\n",
" index_col=\"PassengerId\",\n",
")"
]
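The wrangling cells in `interactive_data_wrangling.ipynb` read credentials with `token_library.getSecret(...)`, which assumes the secrets already exist in the referenced Azure Key Vault. The snippet below is a hypothetical provisioning helper, not part of this PR; it assumes the `azure-identity` and `azure-keyvault-secrets` packages and a caller with permission to set secrets on the vault:

```python
# Hypothetical setup helper, not part of this PR: store the storage access key in
# Key Vault so that token_library.getSecret(key_vault_name, access_key_secret_name)
# in the notebook can retrieve it later.
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

key_vault_name = "<KEY_VAULT_NAME>"                  # same placeholder as the notebook
access_key_secret_name = "<ACCESS_KEY_SECRET_NAME>"  # same placeholder as the notebook

secret_client = SecretClient(
    vault_url=f"https://{key_vault_name}.vault.azure.net",
    credential=DefaultAzureCredential(),
)
secret_client.set_secret(access_key_secret_name, "<STORAGE_ACCOUNT_ACCESS_KEY>")
```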