Commit 6b8933a: Publishing v2.3.9

SireInsectus authored and committed Oct 21, 2022
1 parent 9f64499
Showing 28 changed files with 696 additions and 368 deletions.
@@ -154,8 +154,14 @@
-- COMMAND ----------

-- MAGIC %python
-- MAGIC import pyspark.sql.functions as F
-- MAGIC assert spark.table("events_raw").count() == 2252, "The table should have 2252 records"
-- MAGIC assert set(row['timestamp'] for row in spark.table("events_raw").select("timestamp").limit(5).collect()) == {1593880885085, 1593880892303, 1593880889174, 1593880886106, 1593880889725}, "Make sure you have not modified the data provided"
-- MAGIC
-- MAGIC first_5 = [row['timestamp'] for row in spark.table("events_raw").select("timestamp").orderBy(F.col("timestamp").asc()).limit(5).collect()]
-- MAGIC assert first_5 == [1593879303631, 1593879304224, 1593879305465, 1593879305482, 1593879305746], "Make sure you have not modified the data provided"
-- MAGIC
-- MAGIC last_5 = [row['timestamp'] for row in spark.table("events_raw").select("timestamp").orderBy(F.col("timestamp").desc()).limit(5).collect()]
-- MAGIC assert last_5 == [1593881096290, 1593881095799, 1593881093452, 1593881093394, 1593881092076], "Make sure you have not modified the data provided"
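
The switch from a set-membership check to ordered first/last comparisons matters because `limit(5)` without an `orderBy` gives no guarantee about which rows Spark returns, which is presumably why this commit replaces the old assertion. A minimal sketch of the distinction, assuming a Databricks notebook session where `spark` and the `events_raw` table exist:

```python
# Illustrative sketch (not part of the commit): why the ordered checks are
# more robust than an unordered limit(5) sample.
import pyspark.sql.functions as F

events = spark.table("events_raw")

# Non-deterministic: which 5 rows limit(5) returns depends on partition layout.
unordered_sample = events.select("timestamp").limit(5).collect()
print({r["timestamp"] for r in unordered_sample})

# Deterministic: an explicit sort pins down exactly which rows are checked.
first_5 = [r["timestamp"] for r in
           events.select("timestamp").orderBy(F.col("timestamp").asc()).limit(5).collect()]
last_5 = [r["timestamp"] for r in
          events.select("timestamp").orderBy(F.col("timestamp").desc()).limit(5).collect()]
print(first_5, last_5)
```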

-- COMMAND ----------

@@ -59,28 +59,29 @@
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **`Pipeline Name`** provided by the cell above.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC * Even though this document is a standard Databricks Notebook, the SQL syntax is specialized to DLT table declarations.
# MAGIC * We will be exploring the syntax in the exercise that follows.
# MAGIC 1. Under **Configuration**, add two configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_dlt_demo_81`**
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_dlt_demo_81`**
# MAGIC * This field is optional; if not specified, then tables will not be registered to a metastore, but will still be available in the DBFS. Refer to the <a href="https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-user-guide.html#publish-tables" target="_blank">documentation</a> for more information on this option.
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC * This optional field allows the user to specify a location to store logs, tables, and other information related to pipeline execution.
# MAGIC * If not specified, DLT will automatically generate a directory.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC * This field specifies how the pipeline will be run.
# MAGIC * **Triggered** pipelines run once and then shut down until the next manual or scheduled update.
# MAGIC * **Continuous** pipelines run continuously, ingesting new data as it arrives. Choose the mode based on latency and cost requirements.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC    * Along with the **spark.master** config above, this will create a **Single Node** cluster.
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC The fields **Enable autoscaling**, **Min Workers** and **Max Workers** control the worker configuration for the underlying cluster processing the pipeline.
# MAGIC
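
For readers who prefer to script this setup, the UI fields above map one-to-one onto fields of the Delta Live Tables Pipelines REST API. A sketch of the equivalent settings payload — the angle-bracket placeholders are illustrative assumptions, not values from this commit:

```python
# Sketch: the UI configuration above, expressed as a Pipelines API payload.
# Placeholder values (<...>) are assumptions you would fill in yourself.
pipeline_settings = {
    "name": "<pipeline-name-from-setup-cell>",
    "edition": "ADVANCED",                       # Product Edition
    "libraries": [{"notebook": {"path": "<notebook-path>"}}],
    "configuration": {
        "spark.master": "local[*]",
        "datasets_path": "<datasets-path-from-setup-cell>",
    },
    "target": "<name>_<hash>_dbacademy_dewd_dlt_demo_81",  # Target database
    "storage": "<storage-location-from-setup-cell>",
    "continuous": False,                         # Pipeline Mode: Triggered
    "photon": True,                              # Use Photon Acceleration
    "channel": "CURRENT",
    "clusters": [{
        "label": "default",
        "num_workers": 0,                        # with spark.master local[*]: Single Node
        "policy_id": "<dlt-policy-id>",          # the Policy selected above
    }],
}
```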
@@ -101,13 +101,14 @@
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name_<hash>_dewd_dlt_lab_82`**
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_dlt_lab_82`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. Enter the location printed next to **`Storage Location`** below in the **Storage Location** field.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
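
Inside the DLT notebook itself, the configuration parameters added above are read back through the Spark configuration. A minimal sketch, assuming the same key names used in this lab:

```python
# Sketch: DLT exposes pipeline configuration parameters via spark.conf.
# The keys below match the ones added in the Configuration step above.
datasets_path = spark.conf.get("datasets_path")
source = spark.conf.get("source")
print(datasets_path, source)
```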

@@ -55,18 +55,21 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the two configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add two configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_jobs_demo_91`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_jobs_demo_91`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
# MAGIC
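
Once the pipeline exists, an update can also be started programmatically rather than from the UI. A hedged sketch using the Pipelines REST API — the workspace URL, token, and pipeline id are placeholders you would supply yourself:

```python
# Sketch: start a triggered update, which is what clicking "Start" in the
# UI does. All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.post(
    f"{host}/api/2.0/pipelines/{pipeline_id}/updates",
    headers={"Authorization": f"Bearer {token}"},
    json={"full_refresh": False},  # set True to reprocess all data
)
resp.raise_for_status()
print(resp.json().get("update_id"))
```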
@@ -65,19 +65,22 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the three configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add three configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_jobs_lab_92`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_jobs_lab_92`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Triggered**.
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC
# MAGIC Finally, click **Create**.
# MAGIC
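
After clicking **Create** and running the pipeline, its state can be checked over the same REST resource. A sketch under the same placeholder assumptions as above:

```python
# Sketch: read back the pipeline's overall state and its recent updates.
# All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.get(
    f"{host}/api/2.0/pipelines/{pipeline_id}",
    headers={"Authorization": f"Bearer {token}"},
)
resp.raise_for_status()
info = resp.json()
print(info.get("state"))  # e.g. IDLE or RUNNING
for update in info.get("latest_updates", []):
    print(update.get("update_id"), update.get("state"))
```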
@@ -72,19 +72,22 @@
# MAGIC 1. Click the **Workflows** button on the sidebar.
# MAGIC 1. Select the **Delta Live Tables** tab.
# MAGIC 1. Click **Create Pipeline**.
# MAGIC 1. Leave **Product Edition** as **Advanced**.
# MAGIC 1. Fill in a **Pipeline Name** - because these names must be unique, we suggest using the **Pipeline Name** provided in the cell above.
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the companion notebook provided in the cell above.
# MAGIC 1. Under **Configuration**, add the three configuration parameters:
# MAGIC 1. For **Notebook Libraries**, use the navigator to locate and select the notebook specified above.
# MAGIC 1. Under **Configuration**, add three configuration parameters:
# MAGIC * Click **Add configuration**, set the "key" to **spark.master** and the "value" to **local[\*]**.
# MAGIC * Click **Add configuration**, set the "key" to **datasets_path** and the "value" to the value provided in the cell above.
# MAGIC * Click **Add configuration**, set the "key" to **source** and the "value" to the value provided in the cell above.
# MAGIC 1. In the **Target** field, enter the database name provided in the cell above.<br/>
# MAGIC This should follow the pattern **`da_<name>_<hash>_dewd_cap_12`**
# MAGIC 1. In the **Storage location** field, copy the directory as printed above.
# MAGIC This should follow the pattern **`<name>_<hash>_dbacademy_dewd_cap_12`**
# MAGIC 1. In the **Storage location** field, enter the path provided in the cell above.
# MAGIC 1. For **Pipeline Mode**, select **Continuous**.
# MAGIC 1. Uncheck the **Enable autoscaling** box
# MAGIC 1. Uncheck the **Enable autoscaling** box.
# MAGIC 1. Set the number of **`workers`** to **`0`** (zero).
# MAGIC 1. Enable **Photon Acceleration**.
# MAGIC 1. Check the **Use Photon Acceleration** box.
# MAGIC 1. For **Channel**, select **Current**.
# MAGIC 1. For **Policy**, select the value provided in the cell above.
# MAGIC 1. Click **Create**.
# MAGIC 1. After the UI updates, change from **Development** to **Production** mode.
# MAGIC
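
Two notes on this capstone variant: a **Continuous** pipeline keeps running until explicitly stopped, and switching from **Development** to **Production** mode changes execution behavior (development mode reuses a long-running cluster and disables retries for fast iteration; production mode provisions a fresh cluster and retries on recoverable failures). A sketch of stopping a continuous pipeline via the REST API, under the same placeholder assumptions as earlier:

```python
# Sketch: halt the active update of a continuous pipeline.
# All <...> values are assumptions, not values from this commit.
import requests

host = "https://<workspace-instance>"
token = "<personal-access-token>"
pipeline_id = "<pipeline-id>"

resp = requests.post(
    f"{host}/api/2.0/pipelines/{pipeline_id}/stop",
    headers={"Authorization": f"Bearer {token}"},
)
resp.raise_for_status()
```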
54 changes: 33 additions & 21 deletions Includes/Classroom-Setup-08.1.1.py
@@ -3,6 +3,17 @@

# COMMAND ----------

@DBAcademyHelper.monkey_patch
def get_dlt_policy(self):
from dbacademy.dbhelper import ClustersHelper

dlt_policy = DA.client.cluster_policies.get_by_name(ClustersHelper.POLICY_DLT_ONLY)
    assert dlt_policy is not None, f"Could not find the cluster policy \"{ClustersHelper.POLICY_DLT_ONLY}\". Please run the notebook Includes/Workspace-Setup before proceeding."

return dlt_policy
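
A hypothetical usage of this helper from a setup cell — the `DA` object comes from the classroom setup, and only the `policy_id` field is needed downstream:

```python
# Hypothetical usage: fetch the DLT-only cluster policy and extract the id
# that create_pipeline() later attaches to the pipeline's cluster.
policy = DA.get_dlt_policy()
policy_id = policy.get("policy_id")
```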

# COMMAND ----------

@DBAcademyHelper.monkey_patch
def get_pipeline_config(self):
path = dbutils.entry_point.getDbutils().notebook().getContext().notebookPath().getOrElse(None)
@@ -15,6 +26,8 @@ def get_pipeline_config(self):
@DBAcademyHelper.monkey_patch
def print_pipeline_config(self):
"Provided by DBAcademy, this function renders the configuration of the pipeline as HTML"
from dbacademy.dbhelper import ClustersHelper

pipeline_name, path = self.get_pipeline_config()

displayHTML(f"""<table style="width:100%">
Expand All @@ -33,8 +46,11 @@ def print_pipeline_config(self):
<tr>
<td style="white-space:nowrap; width:1em">Datasets Path:</td>
<td><input type="text" value="{DA.paths.datasets}" style="width:100%"></td></tr>
<tr>
<td style="white-space:nowrap; width:1em">Policy:</td>
<td><input type="text" value="{ClustersHelper.POLICY_DLT_ONLY}" style="width:100%"></td></tr>
</table>""")


# COMMAND ----------

@@ -57,18 +73,23 @@ def create_pipeline(self):
"spark.master": "local[*]",
"datasets_path": DA.paths.datasets,
},
clusters=[{ "label": "default", "num_workers": 0 }])
clusters=[{
"num_workers": 0,
"policy_id": self.get_dlt_policy().get("policy_id")
}]
)

pipeline_id = response.get("pipeline_id")
print(f"Created pipline {pipeline_id}")
print(f"Created the pipeline \"{pipeline_name}\" ({pipeline_id})")


# COMMAND ----------

@DBAcademyHelper.monkey_patch
def validate_pipeline_config(self):
"Provided by DBAcademy, this function validates the configuration of the pipeline"
import json
from dbacademy.dbhelper import ClustersHelper

pipeline_name, path = self.get_pipeline_config()

@@ -98,13 +119,20 @@ def validate_pipeline_config(self):
spark_master = configuration.get("spark.master")
assert spark_master == f"local[*]", f"Invalid spark.master value. Expected \"local[*]\", found \"{spark_master}\"."

cluster_count = len(spec.get("clusters"))
assert cluster_count == 1, f"Expected one, and only one, cluster configuration, found {cluster_count}. You can use the JSON UI to edit the configuration and remove the extra clusters."

cluster = spec.get("clusters")[0]
autoscale = cluster.get("autoscale")
assert autoscale is None, f"Autoscaling should be disabled."

num_workers = cluster.get("num_workers")
assert num_workers == 0, f"Expected the number of workers to be 0, found {num_workers}."

policy_id = cluster.get("policy_id")
policy_name = None if policy_id is None else self.client.cluster_policies.get_by_id(policy_id).get("name")
assert policy_id == self.get_dlt_policy().get("policy_id"), f"Expected the policy to be set to \"{ClustersHelper.POLICY_DLT_ONLY}\", found \"{policy_name}\"."

development = spec.get("development")
assert development == True, f"The pipeline mode should be set to \"Development\"."

@@ -117,24 +145,8 @@ def validate_pipeline_config(self):
continuous = spec.get("continuous")
assert continuous == False, f"Expected the Pipeline mode to be \"Triggered\", found \"Continuous\"."

policy = self.client.cluster_policies.get_by_name("Student's DLT-Only Policy")
if policy is not None:
cluster = {
"num_workers": 0,
"label": "default",
"policy_id": policy.get("policy_id")
}
self.client.pipelines.create_or_update(name = pipeline_name,
storage = DA.paths.storage_location,
target = DA.schema_name,
notebooks = [path],
configuration = {
"spark.master": "local[*]",
"datasets_path": DA.paths.datasets,
},
clusters=[cluster])
print("All tests passed!")


# COMMAND ----------
