Add blacken-docs to pre-commit linter (kedro-org#683)
mzjp2 committed Feb 17, 2021
1 parent 397d71b commit a779307
Showing 24 changed files with 201 additions and 194 deletions.
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -34,6 +34,12 @@ repos:
^features/steps/test_starter/
)
- repo: https://github.com/asottile/blacken-docs
rev: v1.9.2
hooks:
- id: blacken-docs
entry: blacken-docs --skip-errors

- repo: local
hooks:
- id: isort
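For context, `blacken-docs` runs `black` over Python snippets inside documentation code fences, which is what produces most of the changes below. A minimal, hypothetical illustration of the main effect at play here (black's handling of the "magic" trailing comma); the argument values simply mirror the `load_args` example further down this diff:

```python
# No trailing comma: black collapses a short call onto one line.
load_args = dict(sep=",", skiprows=5, skipfooter=1, na_values=["#NA", "NA"])

# Magic trailing comma: black keeps the call exploded, one argument per line.
load_args = dict(
    sep=",",
    skiprows=5,
    skipfooter=1,
    na_values=["#NA", "NA"],
)
```

Locally, the new hook can be exercised with the standard pre-commit invocation, e.g. `pre-commit run blacken-docs --all-files`.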
6 changes: 3 additions & 3 deletions RELEASE.md
@@ -14,9 +14,9 @@
* `kedro pipeline pull` now uses `pip download` for protocols that are not supported by `fsspec`.
* Cleaned up documentation to fix broken links and rewrite permanently redirected ones.
* Added a `jsonschema` schema definition for the Kedro 0.17 catalog.
* Bumped up maximum required versions for the following packages: `sphinx` (3.5), `pyarrow` (4.0).
* `kedro install` now waits on Windows until all the requirements are installed.
* Fixed a bug where `ParquetDataSet` wasn't creating parent directories on the fly.
* Added `blacken-docs` pre-commit linter to ensure all snippets in the documentation are `black`ed.

## Breaking changes to the API

@@ -440,7 +440,7 @@ As an example, code that used to look like this with the `Pipeline.transform()`
```python
result = my_pipeline.transform(
datasets={"input": "new_input", "output": "new_output", "params:x": "params:y"},
prefix="pre"
prefix="pre",
)
```

@@ -453,7 +453,7 @@ result = pipeline(
inputs={"input": "new_input"},
outputs={"output": "new_output"},
parameters={"params:x": "params:y"},
namespace="pre"
namespace="pre",
)
```

8 changes: 2 additions & 6 deletions docs/source/02_get_started/03_hello_kedro.md
@@ -21,9 +21,7 @@ def return_greeting():
return "Hello"


return_greeting_node = node(
func=return_greeting, inputs=None, outputs="my_salutation"
)
return_greeting_node = node(func=return_greeting, inputs=None, outputs="my_salutation")
```

The `join_statements` function is wrapped by a node called `join_statements_node`, which names a single input (`my_salutation`) and a single output (`my_message`):
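(The node definition this sentence refers to falls outside the visible hunk; a plausible sketch, following the same `node(...)` call pattern as above, with the input and output names taken from the sentence itself, is:)

```python
# Sketch only: the exact form in the docs may differ.
join_statements_node = node(
    join_statements, inputs="my_salutation", outputs="my_message"
)
```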
@@ -97,9 +95,7 @@ def return_greeting():
return "Hello"


return_greeting_node = node(
return_greeting, inputs=None, outputs="my_salutation"
)
return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation")

# Prepare second node
def join_statements(greeting):
57 changes: 28 additions & 29 deletions docs/source/03_tutorial/04_create_pipelines.md
@@ -40,10 +40,10 @@ def _parse_money(x):
def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
"""Preprocess the data for companies.
Args:
companies: Source data.
Returns:
Preprocessed data.
Args:
companies: Source data.
Returns:
Preprocessed data.
"""

@@ -57,10 +57,10 @@ def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
"""Preprocess the data for shuttles.
Args:
shuttles: Source data.
Returns:
Preprocessed data.
Args:
shuttles: Source data.
Returns:
Preprocessed data.
"""
shuttles["d_check_complete"] = shuttles["d_check_complete"].apply(_is_true)
@@ -231,18 +231,17 @@ The next step in the tutorial is to add another node for a function to join toge
<summary><b>Click to expand</b></summary>

```python
def create_master_table(
shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
"""Combines all data to create a master table.
Args:
shuttles: Preprocessed data for shuttles.
companies: Preprocessed data for companies.
reviews: Source data for reviews.
Returns:
Master table.
Args:
shuttles: Preprocessed data for shuttles.
companies: Preprocessed data for companies.
reviews: Source data for reviews.
Returns:
Master table.
"""
rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
@@ -362,12 +361,12 @@ from sklearn.model_selection import train_test_split
def split_data(data: pd.DataFrame, parameters: Dict) -> List:
"""Splits data into training and test sets.
Args:
data: Source data.
parameters: Parameters defined in parameters.yml.
Args:
data: Source data.
parameters: Parameters defined in parameters.yml.
Returns:
A list containing split data.
Returns:
A list containing split data.
"""
X = data[
@@ -390,12 +389,12 @@ def split_data(data: pd.DataFrame, parameters: Dict) -> List:
def train_model(X_train: np.ndarray, y_train: np.ndarray) -> LinearRegression:
"""Train the linear regression model.
Args:
X_train: Training data of independent features.
y_train: Training data for price.
Args:
X_train: Training data of independent features.
y_train: Training data for price.
Returns:
Trained model.
Returns:
Trained model.
"""
regressor = LinearRegression()
@@ -406,10 +405,10 @@ def train_model(X_train: np.ndarray, y_train: np.ndarray) -> LinearRegression:
def evaluate_model(regressor: LinearRegression, X_test: np.ndarray, y_test: np.ndarray):
"""Calculate the coefficient of determination and log the result.
Args:
regressor: Trained model.
X_test: Testing data of independent features.
y_test: Testing data for price.
Args:
regressor: Trained model.
X_test: Testing data of independent features.
y_test: Testing data for price.
"""
y_pred = regressor.predict(X_test)
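This hunk ends mid-function; for orientation, a hedged sketch of how node functions like the ones above are typically assembled into a Kedro pipeline. The node and dataset names are assumptions for illustration, not taken from this diff:

```python
from kedro.pipeline import Pipeline, node

# Hypothetical wiring of split_data, train_model and evaluate_model from the hunks above.
ds_pipeline = Pipeline(
    [
        node(
            split_data,
            ["master_table", "parameters"],
            ["X_train", "X_test", "y_train", "y_test"],
        ),
        node(train_model, ["X_train", "y_train"], "regressor"),
        node(evaluate_model, ["regressor", "X_test", "y_test"], None),
    ]
)
```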
5 changes: 1 addition & 4 deletions docs/source/04_kedro_project_setup/02_configuration.md
@@ -107,10 +107,7 @@ The contents of the dictionary resulting from `globals_pattern` get merged with
"bucket_name": "another_bucket_name",
"non_string_key": 10,
"key_prefix": "my/key/prefix",
"datasets": {
"csv": "pandas.CSVDataSet",
"spark": "spark.SparkDataSet"
},
"datasets": {"csv": "pandas.CSVDataSet", "spark": "spark.SparkDataSet"},
"folders": {
"raw": "01_raw",
"int": "02_intermediate",
1 change: 1 addition & 0 deletions docs/source/04_kedro_project_setup/03_session.md
@@ -45,6 +45,7 @@ When you want to access the most recent session object, use a helper function

```python
from kedro.framework.session import get_current_session

session = get_current_session()
context = session.load_context()
context.catalog.load("my_data").head()
18 changes: 10 additions & 8 deletions docs/source/05_data/01_data_catalog.md
@@ -342,7 +342,12 @@ In the example above `catalog.yml` contains references to credentials keys `dev_
```python
CSVDataSet(
filepath="s3://test_bucket/data/02_intermediate/company/motorbikes.csv",
load_args=dict(sep=",", skiprows=5, skipfooter=1, na_values=["#NA", "NA"],),
load_args=dict(
sep=",",
skiprows=5,
skipfooter=1,
na_values=["#NA", "NA"],
),
credentials=dict(key="token", secret="key"),
)
```
@@ -465,16 +470,15 @@ Transformers are applied at the `DataCatalog` level. To apply the built-in `Prof
```python
# src/<package_name>/hooks.py
from kedro.extras.transformers import ProfileTimeTransformer # new import
from kedro.framework.hooks import hook_impl # new import
from kedro.io import DataCatalog # new import
from kedro.extras.transformers import ProfileTimeTransformer # new import
from kedro.framework.hooks import hook_impl # new import
from kedro.io import DataCatalog # new import
class TransformerHooks:
@hook_impl
def after_catalog_created(self, catalog: DataCatalog) -> None:
catalog.add_transformer(ProfileTimeTransformer())
```
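The hook class above still needs to be registered with the project. A sketch, assuming the standard Kedro 0.17 project template where hooks are declared in `settings.py` (the package name `my_project` is a placeholder):

```python
# src/my_project/settings.py -- "my_project" is a placeholder package name
from my_project.hooks import TransformerHooks

HOOKS = (TransformerHooks(),)
```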

```python
@@ -557,9 +561,7 @@ from kedro.extras.datasets.pandas import (
io = DataCatalog(
{
"bikes": CSVDataSet(filepath="../data/01_raw/bikes.csv"),
"cars": CSVDataSet(
filepath="../data/01_raw/cars.csv", load_args=dict(sep=",")
),
"cars": CSVDataSet(filepath="../data/01_raw/cars.csv", load_args=dict(sep=",")),
"cars_table": SQLTableDataSet(
table_name="cars", credentials=dict(con="sqlite:///kedro.db")
),
17 changes: 10 additions & 7 deletions docs/source/05_data/02_kedro_io.md
@@ -59,7 +59,7 @@ from kedro.io import AbstractVersionedDataSet


class MyOwnDataSet(AbstractVersionedDataSet):
def __init__(self, filepath, version, param1, param2=True):
def __init__(self, filepath, version, param1, param2=True):
super().__init__(PurePosixPath(filepath), version)
self._param1 = param1
self._param2 = param2
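The hunk only shows `__init__`; for completeness, a minimal sketch of the remaining abstract methods, assuming a pandas CSV payload (the wording in the actual documentation may differ):

```python
import pandas as pd

from kedro.io import AbstractVersionedDataSet


class MyOwnDataSet(AbstractVersionedDataSet):  # continuing the class shown above
    # __init__ as in the hunk above ...

    def _load(self) -> pd.DataFrame:
        # _get_load_path() resolves the filepath for the requested load version
        return pd.read_csv(self._get_load_path())

    def _save(self, df: pd.DataFrame) -> None:
        # _get_save_path() resolves the filepath for the save version
        df.to_csv(self._get_save_path())

    def _describe(self):
        return dict(version=self._version, param1=self._param1, param2=self._param2)
```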
@@ -149,8 +149,9 @@ version = Version(
)
test_data_set = CSVDataSet(
filepath="data/01_raw/test.csv", save_args={"index": False}, version=version,
filepath="data/01_raw/test.csv",
save_args={"index": False},
version=version,
)
io = DataCatalog({"test_data_set": test_data_set})
@@ -173,8 +174,9 @@ version = Version(
)
test_data_set = CSVDataSet(
filepath="data/01_raw/test.csv", save_args={"index": False}, version=version,
filepath="data/01_raw/test.csv",
save_args={"index": False},
version=version,
)
io = DataCatalog({"test_data_set": test_data_set})
@@ -198,8 +200,9 @@ version = Version(
)
test_data_set = CSVDataSet(
filepath="data/01_raw/test.csv", save_args={"index": False}, version=version,
filepath="data/01_raw/test.csv",
save_args={"index": False},
version=version,
)
io = DataCatalog({"test_data_set": test_data_set})
14 changes: 8 additions & 6 deletions docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md
@@ -140,11 +140,13 @@ nodes
The output is as follows:

```python
[Node(len, 'xs', 'n', None),
Node(mean, ['xs', 'n'], 'm', 'mean_node'),
Node(mean_sos, ['xs', 'n'], 'm2', 'mean_sos'),
Node(variance, ['m', 'm2'], 'v', 'variance node')]
```
[
Node(len, "xs", "n", None),
Node(mean, ["xs", "n"], "m", "mean_node"),
Node(mean_sos, ["xs", "n"], "m2", "mean_sos"),
Node(variance, ["m", "m2"], "v", "variance node"),
]
```

To find out about the inputs:

@@ -155,7 +157,7 @@ nodes[0].inputs
You should see the following:

```python
['xs']
["xs"]
```
</details>

41 changes: 26 additions & 15 deletions docs/source/06_nodes_and_pipelines/03_modular_pipelines.md
@@ -273,10 +273,17 @@ Consider this example:

```python
cook_pipeline = Pipeline(
[node(defrost, "frozen_meat", "meat"), node(grill, "meat", "grilled_meat"),]
[
node(defrost, "frozen_meat", "meat"),
node(grill, "meat", "grilled_meat"),
]
)

lunch_pipeline = Pipeline([node(eat, "food", None),])
lunch_pipeline = Pipeline(
[
node(eat, "food", None),
]
)
```

A simple `cook_pipeline + lunch_pipeline` doesn't work, because the `grilled_meat` output in the `cook_pipeline` needs to be mapped to the `food` input in the `lunch_pipeline`. This can be done in any of the following three (equivalent) ways:
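The three variants are collapsed in this view of the diff; as a rough sketch (using the same `pipeline()` wrapper shown elsewhere on this page, with the mapping taken from the sentence above), one of them looks like this:

```python
from kedro.pipeline import pipeline

# Map cook_pipeline's "grilled_meat" output onto lunch_pipeline's "food" input
final_pipeline = (
    pipeline(cook_pipeline, outputs={"grilled_meat": "food"}) + lunch_pipeline
)

# Equivalently, remap lunch_pipeline's input instead
final_pipeline = cook_pipeline + pipeline(
    lunch_pipeline, inputs={"food": "grilled_meat"}
)
```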
@@ -302,12 +309,14 @@ final_pipeline3 = pipeline(
Remember you can pass `Pipeline` objects in the constructor as well, like in the example below. This approach is cleaner and more idiomatic when you are combining multiple modular pipelines together.

```python
final_pipeline = Pipeline([
pipeline(cook_pipeline, outputs={"grilled_meat": "new_name"}),
pipeline(lunch_pipeline, inputs={"food": "new_name"}),
node(...),
...
])
final_pipeline = Pipeline(
[
pipeline(cook_pipeline, outputs={"grilled_meat": "new_name"}),
pipeline(lunch_pipeline, inputs={"food": "new_name"}),
node(...),
...,
]
)
```

>*Note:* `inputs` should correspond to the pipeline free inputs, while `outputs` are either free or intermediary outputs.
@@ -336,13 +345,13 @@ cook_breakfast_pipeline = pipeline(
cook_pipeline,
inputs="frozen_meat", # inputs stay the same, don't namespace
outputs={"grilled_meat": "breakfast_food"},
namespace="breakfast"
namespace="breakfast",
)
cook_lunch_pipeline = pipeline(
cook_pipeline,
inputs="frozen_meat", # inputs stay the same, don't namespace
outputs={"grilled_meat": "lunch_food"},
namespace="lunch"
namespace="lunch",
)

final_pipeline = (
@@ -374,15 +383,17 @@ final_pipeline = pipeline(raw_pipeline, namespace="new")
You can map parameter values in a similar way to inputs and outputs. Let's say you have two almost identical pipelines that differ by one parameter. You want to run the pipelines on the same set of inputs.

```python
alpha_pipeline = Pipeline([
node(node_func1, ["input1", "input2", "params:alpha"], "intermediary_output"),
node(node_func2, "intermediary_output", "output")
])
alpha_pipeline = Pipeline(
[
node(node_func1, ["input1", "input2", "params:alpha"], "intermediary_output"),
node(node_func2, "intermediary_output", "output"),
]
)
beta_pipeline = pipeline(
alpha_pipeline,
inputs={"input1", "input2"},
parameters={"params:alpha": "params:beta"},
namespace="beta"
namespace="beta",
)

final_pipeline = alpha_pipeline + beta_pipeline
1 change: 0 additions & 1 deletion docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md
@@ -152,7 +152,6 @@ from kedro.framework.hooks import hook_impl


class ProjectHooks:

@hook_impl
def register_pipelines(self):
"""Register the project's pipelines.