Commit 6b8933a: Publishing v2.3.9

SireInsectus authored and committed Oct 21, 2022
1 parent 9f64499
Showing 28 changed files with 696 additions and 368 deletions.
@@ -154,8 +154,14 @@
-- COMMAND ----------

-- MAGIC %python
-- MAGIC import pyspark.sql.functions as F
-- MAGIC assert spark.table("events_raw").count() == 2252, "The table should have 2252 records"
-- MAGIC assert set(row['timestamp'] for row in spark.table("events_raw").select("timestamp").limit(5).collect()) == {1593880885085, 1593880892303, 1593880889174, 1593880886106, 1593880889725}, "Make sure you have not modified the data provided"
-- MAGIC
-- MAGIC first_5 = [row['timestamp'] for row in spark.table("events_raw").select("timestamp").orderBy(F.col("timestamp").asc()).limit(5).collect()]
-- MAGIC assert first_5 == [1593879303631, 1593879304224, 1593879305465, 1593879305482, 1593879305746], "Make sure you have not modified the data provided"
-- MAGIC
-- MAGIC last_5 = [row['timestamp'] for row in spark.table("events_raw").select("timestamp").orderBy(F.col("timestamp").desc()).limit(5).collect()]
-- MAGIC assert last_5 == [1593881096290, 1593881095799, 1593881093452, 1593881093394, 1593881092076], "Make sure you have not modified the data provided"
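
The switch from a set-membership check to ordered first/last comparisons matters because `limit(5)` without an `orderBy` gives no guarantee about which rows Spark returns, which is presumably why this commit replaces the old assertion. A minimal sketch of the distinction, assuming a Databricks notebook session where `spark` and the `events_raw` table exist:

```python
# Illustrative sketch (not part of the commit): why the ordered checks are
# more robust than an unordered limit(5) sample.
import pyspark.sql.functions as F

events = spark.table("events_raw")

# Non-deterministic: which 5 rows limit(5) returns depends on partition layout.
unordered_sample = events.select("timestamp").limit(5).collect()
print({r["timestamp"] for r in unordered_sample})

# Deterministic: an explicit sort pins down exactly which rows are checked.
first_5 = [r["timestamp"] for r in
           events.select("timestamp").orderBy(F.col("timestamp").asc()).limit(5).collect()]
last_5 = [r["timestamp"] for r in
          events.select("timestamp").orderBy(F.col("timestamp").desc()).limit(5).collect()]
print(first_5, last_5)
```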

-- COMMAND ----------

@@ -59,28 +59,29 @@
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **`Pipeline Name`** provided by the cell above.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC * Even though this document is a standard Databricks Notebook, the SQL syntax is specialized to DLT table declarations.
# MAGIC * We will be exploring the syntax in the exercise that follows.
# MAGIC 1. Under **Configuration**, add two configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_dlt_demo_81`**
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_dlt_demo_81`**
# MAGIC * This field is optional; if not specified, then tables will not be registered to a metastore, but will still be available in the DBFS. Refer to the <a href="https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-user-guide.html#publish-tables" target="_blank">documentation</a> for more information on this option.
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC * This optional field allows the user to specify a location to store logs, tables, and other information related to pipeline execution.
# MAGIC * If not specified, DLT will automatically generate a directory.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC * This field specifies how the pipeline will be run.
# MAGIC * **Triggered** pipelines run once and then shut down until the next manual or scheduled update.
# MAGIC * **Continuous** pipelines run continuously, ingesting new data as it arrives. Choose the mode based on latency and cost requirements.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC    * Along with the **spark.master** config above, this will create a **Single Node** cluster.
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC The fields **Enable autoscaling**, **Min Workers** and **Max Workers** control the worker configuration for the underlying cluster processing the pipeline.
# MAGIC
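
For readers who prefer to script this setup, the UI fields above map one-to-one onto fields of the Delta Live Tables Pipelines REST API. A sketch of the equivalent settings payload — the angle-bracket placeholders are illustrative assumptions, not values from this commit:

```python
# Sketch: the UI configuration above, expressed as a Pipelines API payload.
# Placeholder values (<...>) are assumptions you would fill in yourself.
pipeline_settings = {
    "name": "<pipeline-name-from-setup-cell>",
    "edition": "ADVANCED",                       # Product Edition
    "libraries": [{"notebook": {"path": "<notebook-path>"}}],
    "configuration": {
        "spark.master": "local[*]",
        "datasets_path": "<datasets-path-from-setup-cell>",
    },
    "target": "<name>_<hash>_dbacademy_dewd_dlt_demo_81",  # Target database
    "storage": "<storage-location-from-setup-cell>",
    "continuous": False,                         # Pipeline Mode: Triggered
    "photon": True,                              # Use Photon Acceleration
    "channel": "CURRENT",
    "clusters": [{
        "label": "default",
        "num_workers": 0,                        # with spark.master local[*]: Single Node
        "policy_id": "<dlt-policy-id>",          # the Policy selected above
    }],
}
```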
@@ -101,13 +101,14 @@
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name_<hash>_dewd_dlt_lab_82`**
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_dlt_lab_82`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. Enter the location printed next to **`Storage Location`** below in the **Storage Location** field.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
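
Inside the DLT notebook itself, the configuration parameters added above are read back through the Spark configuration. A minimal sketch, assuming the same key names used in this lab:

```python
# Sketch: DLT exposes pipeline configuration parameters via spark.conf.
# The keys below match the ones added in the Configuration step above.
datasets_path = spark.conf.get("datasets_path")
source = spark.conf.get("source")
print(datasets_path, source)
```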

@@ -55,18 +55,21 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the two configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add two configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_jobs_demo_91`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_jobs_demo_91`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
# MAGIC
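
Once the pipeline exists, an update can also be started programmatically rather than from the UI. A hedged sketch using the Pipelines REST API — the workspace URL, token, and pipeline id are placeholders you would supply yourself:

```python
# Sketch: start a triggered update, which is what clicking "Start" in the
# UI does. All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.post(
    f"{host}/api/2.0/pipelines/{pipeline_id}/updates",
    headers={"Authorization": f"Bearer {token}"},
    json={"full_refresh": False},  # set True to reprocess all data
)
resp.raise_for_status()
print(resp.json().get("update_id"))
```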
@@ -65,19 +65,22 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the three configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add three configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_jobs_lab_92`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_jobs_lab_92`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
# MAGIC
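
After clicking **Create** and running the pipeline, its state can be checked over the same REST resource. A sketch under the same placeholder assumptions as above:

```python
# Sketch: read back the pipeline's overall state and its recent updates.
# All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.get(
    f"{host}/api/2.0/pipelines/{pipeline_id}",
    headers={"Authorization": f"Bearer {token}"},
)
resp.raise_for_status()
info = resp.json()
print(info.get("state"))  # e.g. IDLE or RUNNING
for update in info.get("latest_updates", []):
    print(update.get("update_id"), update.get("state"))
```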
@@ -72,19 +72,22 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the three configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add three configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_cap_12`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_cap_12`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Continuous**.
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC 1. Click **Create**.
# MAGIC 1. After the UI updates, change from **Development** to **Production** mode.
# MAGIC
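
Two notes on this capstone variant: a **Continuous** pipeline keeps running until explicitly stopped, and switching from **Development** to **Production** mode changes execution behavior (development mode reuses a long-running cluster and disables retries for fast iteration; production mode provisions a fresh cluster and retries on recoverable failures). A sketch of stopping a continuous pipeline via the REST API, under the same placeholder assumptions as earlier:

```python
# Sketch: halt the active update of a continuous pipeline.
# All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.post(
    f"{host}/api/2.0/pipelines/{pipeline_id}/stop",
    headers={"Authorization": f"Bearer {token}"},
)
resp.raise_for_status()
```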
54 changes: 33 additions & 21 deletions Includes/Classroom-Setup-08.1.1.py
@@ -3,6 +3,17 @@

# COMMAND ----------

@DBAcademyHelper.monkey_patch
def get_dlt_policy(self):
from dbacademy.dbhelper import ClustersHelper

dlt_policy = DA.client.cluster_policies.get_by_name(ClustersHelper.POLICY_DLT_ONLY)
    assert dlt_policy is not None, f"Could not find the cluster policy \"{ClustersHelper.POLICY_DLT_ONLY}\". Please run the notebook Includes/Workspace-Setup before proceeding."

return dlt_policy
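
A hypothetical usage of this helper from a setup cell — the `DA` object comes from the classroom setup, and only the `policy_id` field is needed downstream:

```python
# Hypothetical usage: fetch the DLT-only cluster policy and extract the id
# that create_pipeline() later attaches to the pipeline's cluster.
policy = DA.get_dlt_policy()
policy_id = policy.get("policy_id")
```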

# COMMAND ----------

@DBAcademyHelper.monkey_patch
def get_pipeline_config(self):
path = dbutils.entry_point.getDbutils().notebook().getContext().notebookPath().getOrElse(None)
@@ -15,6 +26,8 @@ def get_pipeline_config(self):
@DBAcademyHelper.monkey_patch
def print_pipeline_config(self):
"Provided by DBAcademy, this function renders the configuration of the pipeline as HTML"
from dbacademy.dbhelper import ClustersHelper

pipeline_name, path = self.get_pipeline_config()

displayHTML(f"""<table style="width:100%">
Expand All @@ -33,8 +46,11 @@ def print_pipeline_config(self):
<tr>
<td style="white-space:nowrap; width:1em">Datasets Path:</td>
<td><input type="text" value="{DA.paths.datasets}" style="width:100%"></td></tr>
<tr>
<td style="white-space:nowrap; width:1em">Policy:</td>
<td><input type="text" value="{ClustersHelper.POLICY_DLT_ONLY}" style="width:100%"></td></tr>
</table>""")


# COMMAND ----------

@@ -57,18 +73,23 @@ def create_pipeline(self):
"spark.master": "local[*]",
"datasets_path": DA.paths.datasets,
},
clusters=[{ "label": "default", "num_workers": 0 }])
clusters=[{
"num_workers": 0,
"policy_id": self.get_dlt_policy().get("policy_id")
}]
)

pipeline_id = response.get("pipeline_id")
print(f"Created pipline {pipeline_id}")
print(f"Created the pipeline \"{pipeline_name}\" ({pipeline_id})")


# COMMAND ----------

@DBAcademyHelper.monkey_patch
def validate_pipeline_config(self):
"Provided by DBAcademy, this function validates the configuration of the pipeline"
import json
from dbacademy.dbhelper import ClustersHelper

pipeline_name, path = self.get_pipeline_config()

@@ -98,13 +119,20 @@ def validate_pipeline_config(self):
spark_master = configuration.get("spark.master")
assert spark_master == f"local[*]", f"Invalid spark.master value. Expected \"local[*]\", found \"{spark_master}\"."

cluster_count = len(spec.get("clusters"))
assert cluster_count == 1, f"Expected one, and only one, cluster configuration, found {cluster_count}. You can use the JSON UI to edit the configuration and remove the extra clusters."

cluster = spec.get("clusters")[0]
autoscale = cluster.get("autoscale")
assert autoscale is None, f"Autoscaling should be disabled."

num_workers = cluster.get("num_workers")
assert num_workers == 0, f"Expected the number of workers to be 0, found {num_workers}."

policy_id = cluster.get("policy_id")
policy_name = None if policy_id is None else self.client.cluster_policies.get_by_id(policy_id).get("name")
assert policy_id == self.get_dlt_policy().get("policy_id"), f"Expected the policy to be set to \"{ClustersHelper.POLICY_DLT_ONLY}\", found \"{policy_name}\"."

development = spec.get("development")
assert development == True, f"The pipeline mode should be set to \"Development\"."

@@ -117,24 +145,8 @@ def validate_pipeline_config(self):
continuous = spec.get("continuous")
assert continuous == False, f"Expected the Pipeline mode to be \"Triggered\", found \"Continuous\"."

policy = self.client.cluster_policies.get_by_name("Student's DLT-Only Policy")
if policy is not None:
cluster = {
"num_workers": 0,
"label": "default",
"policy_id": policy.get("policy_id")
}
self.client.pipelines.create_or_update(name = pipeline_name,
storage = DA.paths.storage_location,
target = DA.schema_name,
notebooks = [path],
configuration = {
"spark.master": "local[*]",
"datasets_path": DA.paths.datasets,
},
clusters=[cluster])
print("All tests passed!")


# COMMAND ----------
