
Commit

Merge pull request #1 from apache/master
Sync hudi master
loukey-lj committed Jan 26, 2021
2 parents 56866a1 + c4afd17 commit ee9ae14
Showing 526 changed files with 22,236 additions and 3,229 deletions.
6 changes: 4 additions & 2 deletions .travis.yml
@@ -20,10 +20,12 @@ jdk:
   - openjdk8
 jobs:
   include:
-    - name: "Unit tests except hudi-spark-client"
-      env: MODE=unit MODULES='!hudi-client/hudi-spark-client' HUDI_QUIETER_LOGGING=1
     - name: "Unit tests for hudi-spark-client"
       env: MODE=unit MODULES=hudi-client/hudi-spark-client HUDI_QUIETER_LOGGING=1
+    - name: "Unit tests for hudi-utilities"
+      env: MODE=unit MODULES=hudi-utilities HUDI_QUIETER_LOGGING=1
+    - name: "All other unit tests"
+      env: MODE=unit MODULES='!hudi-utilities,!hudi-client/hudi-spark-client' HUDI_QUIETER_LOGGING=1
     - name: "Functional tests"
       env: MODE=functional HUDI_QUIETER_LOGGING=1
     - name: "Integration tests"
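
The four unit-test jobs above differ only in their MODULES value, so the split relies on the CI entry script scoping Maven to those modules. A minimal sketch of how such a script could consume MODE and MODULES (the script itself and the exact Maven flags are assumptions, not code from this repository):

```sh
#!/usr/bin/env bash
# Hypothetical CI helper, not Hudi's actual script.
# Maven's -pl takes a comma-separated project list; a leading '!' excludes
# a module, and -am also builds the selected modules' upstream dependencies.
set -euo pipefail

if [ "${MODE}" = "unit" ]; then
  mvn test -pl "${MODULES}" -am
fi
```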
2 changes: 2 additions & 0 deletions LICENSE
@@ -246,6 +246,8 @@ This product includes code from Apache Spark

 * org.apache.hudi.AvroConversionHelper copied from classes in org/apache/spark/sql/avro package
 
+* org.apache.hudi.HoodieSparkUtils.scala copied some methods from org.apache.spark.deploy.SparkHadoopUtil.scala
+
 Copyright: 2014 and onwards The Apache Software Foundation
 Home page: http://spark.apache.org/
 License: http://www.apache.org/licenses/LICENSE-2.0
8 changes: 8 additions & 0 deletions README.md
@@ -76,6 +76,14 @@ The default Scala version supported is 2.11. To build for Scala 2.12 version, bu
 mvn clean package -DskipTests -Dscala-2.12
 ```
 
+### Build with Spark 3.0.0
+
+The default Spark version supported is 2.4.4. To build for Spark 3.0.0 version, build using `spark3` profile
+
+```
+mvn clean package -DskipTests -Dspark3
+```
+
 ### Build without spark-avro module
 
 The default hudi-jar bundles spark-avro module. To build without spark-avro module, build using `spark-shade-unbundle-avro` profile
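
These build switches are independent `-D` flags, so they can be combined in a single invocation. A hedged example (whether `spark3` also requires selecting Scala 2.12 explicitly, and whether all three profiles compose cleanly, depends on the profile definitions in this revision):

```sh
# Sketch: Spark 3.0.0 build with Scala 2.12 and without the bundled
# spark-avro module, combining the profiles described above.
mvn clean package -DskipTests -Dspark3 -Dscala-2.12 -Dspark-shade-unbundle-avro
```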
172 changes: 53 additions & 119 deletions docker/demo/config/test-suite/complex-dag-cow.yaml
@@ -13,122 +13,56 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-first_insert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    repeat_count: 1
-    num_records_insert: 1000
-  type: InsertNode
-  deps: none
-second_insert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    repeat_count: 1
-    num_records_insert: 10000
-  deps: first_insert
-  type: InsertNode
-third_insert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    repeat_count: 1
-    num_records_insert: 300
-  deps: second_insert
-  type: InsertNode
-first_rollback:
-  config:
-  deps: third_insert
-  type: RollbackNode
-first_upsert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    num_records_insert: 300
-    repeat_count: 1
-    num_records_upsert: 100
-    num_upsert_partitions: 10
-  type: UpsertNode
-  deps: first_rollback
-first_hive_sync:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-  type: HiveSyncNode
-  deps: first_upsert
-first_hive_query:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-    hive_queries:
-      query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
-      result1: 0
-      query2: "select count(*) from testdb.table1"
-      result2: 11300
-  type: HiveQueryNode
-  deps: first_hive_sync
-second_upsert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    num_records_insert: 300
-    repeat_count: 1
-    num_records_upsert: 100
-    num_upsert_partitions: 10
-  type: UpsertNode
-  deps: first_hive_query
-second_hive_query:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-    hive_queries:
-      query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
-      result1: 0
-      query2: "select count(*) from testdb.table1"
-      result2: 11600
-  type: HiveQueryNode
-  deps: second_upsert
-fourth_insert:
-  config:
-    record_size: 70000
-    num_insert_partitions: 1
-    repeat_count: 1
-    num_records_insert: 1000
-  deps: second_hive_query
-  type: InsertNode
-third_hive_query:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-    hive_queries:
-      query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
-      result1: 0
-      query2: "select count(*) from testdb.table1"
-      result2: 12600
-  type: HiveQueryNode
-  deps: fourth_insert
-first_delete:
-  config:
-    record_size: 70000
-    num_partitions_delete: 1
-    num_records_delete: 200
-  deps: third_hive_query
-  type: DeleteNode
-fourth_hive_sync:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-  type: HiveSyncNode
-  deps: first_delete
-fourth_hive_query:
-  config:
-    queue_name: "adhoc"
-    engine: "mr"
-    hive_queries:
-      query1: "select count(*) from testdb.table1 group by `_row_key` having count(*) > 1"
-      result1: 0
-      query2: "select count(*) from testdb.table1"
-      result2: 12400
-  type: HiveQueryNode
-  deps: fourth_hive_sync
+dag_name: cow-long-running-example.yaml
+dag_rounds: 2
+dag_intermittent_delay_mins: 1
+dag_content:
+  first_insert:
+    config:
+      record_size: 100
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 1000
+    type: InsertNode
+    deps: none
+  second_insert:
+    config:
+      record_size: 100
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 10000
+    deps: first_insert
+    type: InsertNode
+  third_insert:
+    config:
+      record_size: 100
+      num_partitions_insert: 1
+      repeat_count: 1
+      num_records_insert: 300
+    deps: second_insert
+    type: InsertNode
+  first_validate:
+    config:
+    type: ValidateDatasetNode
+    deps: third_insert
+  first_upsert:
+    config:
+      record_size: 100
+      num_partitions_insert: 1
+      num_records_insert: 300
+      repeat_count: 1
+      num_records_upsert: 100
+      num_partitions_upsert: 1
+    type: UpsertNode
+    deps: first_validate
+  first_delete:
+    config:
+      num_partitions_delete: 1
+      num_records_delete: 2000
+    type: DeleteNode
+    deps: first_upsert
+  second_validate:
+    config:
+      delete_input_data: true
+    type: ValidateDatasetNode
+    deps: first_delete
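
The rewrite wraps the node graph in explicit top-level metadata: the names suggest that everything under dag_content runs dag_rounds times with dag_intermittent_delay_mins between rounds, and each node declares its upstream node via deps and its behavior via type. A minimal dag in the same shape (node and file names here are illustrative, not from the repository):

```yaml
# Illustrative minimal dag in the same format as complex-dag-cow.yaml.
dag_name: minimal-example.yaml
dag_rounds: 1
dag_intermittent_delay_mins: 0
dag_content:
  insert_a:                      # seed the table with one small insert
    config:
      record_size: 100
      num_partitions_insert: 1
      repeat_count: 1
      num_records_insert: 500
    type: InsertNode
    deps: none
  validate_a:                    # then validate the resulting dataset
    config:
    type: ValidateDatasetNode
    deps: insert_a
```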