
Commit

Merge pull request FederatedAI#4282 from FederatedAI/feature-1.9.0-rm_dataio

add warning when using dataio and modify doc
mgqa34 authored Aug 26, 2022
2 parents 90e6b0a + a66a6be commit 257316e
Showing 12 changed files with 91 additions and 113 deletions.
64 changes: 32 additions & 32 deletions doc/api/fate_client/pipeline_component.md
@@ -19,11 +19,11 @@ components' input, check the [list](../../federatedml_component/README.md).
Here is an example to access a component's input:

``` sourceCode python
-from pipeline.component import DataIO
-dataio_0 = DataIO(name="dataio_0")
-input_all = dataio_0.input
-input_data = dataio_0.input.data
-input_model = dataio_0.input.model
+from pipeline.component import DataTransform
+data_transform_0 = DataTransform(name="data_transform_0")
+input_all = data_transform_0.input
+input_data = data_transform_0.input.data
+input_model = data_transform_0.input.model
```

### Output
@@ -37,11 +37,11 @@ information on each components' output, check the
Here is an example to access a component's output:

``` sourceCode python
-from pipeline.component import DataIO
-dataio_0 = DataIO(name="dataio_0")
-output_all = dataio_0.output
-output_data = dataio_0.output.data
-output_model = dataio_0.output.model
+from pipeline.component import DataTransform
+data_transform_0 = DataTransform(name="data_transform_0")
+output_all = data_transform_0.output
+output_data = data_transform_0.output.data
+output_model = data_transform_0.output.model
```

Meanwhile, to download components' output table or model, please use
@@ -51,12 +51,12 @@ Meanwhile, to download components' output table or model, please use

In most cases, data sets are wrapped into `data` when being passed
between modules. For instance, in the [mini
-demo](../../../examples/pipeline/demo/pipeline-mini-demo.py), data output of `dataio_0` is set
+demo](../../../examples/pipeline/demo/pipeline-mini-demo.py), data output of `data_transform_0` is set
as data input to
`intersection_0`.

``` sourceCode python
-pipeline.add_component(intersection_0, data=Data(data=dataio_0.output.data))
+pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))
```

For data sets used in different modeling stages (e.g., train & validate)
@@ -96,7 +96,7 @@ pipeline.add_component(hetero_lr_1,
To run prediction with new data, data source needs to be updated in
prediction job. Below is an example from [mini
demo](../../../examples/pipeline/demo/pipeline-mini-demo.py), where data input of original
<span class="title-ref">dataio\_0</span> component is set to be the data
<span class="title-ref">data\_transform\_0</span> component is set to be the data
output from <span class="title-ref">reader\_2</span>.

``` sourceCode python
@@ -106,7 +106,7 @@ reader_2.get_party_instance(role="host", party_id=host).component_param(table=ho
# add data reader onto predict pipeline
predict_pipeline.add_component(reader_2)
predict_pipeline.add_component(pipeline,
-                               data=Data(predict_input={pipeline.dataio_0.input.data: reader_2.output.data}))
+                               data=Data(predict_input={pipeline.data_transform_0.input.data: reader_2.output.data}))
```

Below lists all five types of `data` and whether `Input` and `Output`
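(The table itself is collapsed in this diff view; as an assumption based on the FATE-Pipeline docs rather than on visible rows, the five types are `data`, `train_data`, `validate_data`, `test_data`, and `predict_input`.)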
@@ -131,17 +131,17 @@ input and output:

``` sourceCode python
from pipeline.backend.pipeline import PipeLine
-from pipeline.component import DataIO, Intersection, HeteroDataSplit, HeteroLR
+from pipeline.component import DataTransform, Intersection, HeteroDataSplit, HeteroLR
# initialize a pipeline
pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest)
# define all components
dataio_0 = DataIO(name="dataio_0")
data_transform_0 = DataTransform(name="data_transform_0")
hetero_data_split_0 = HeteroDataSplit(name="data_split_0")
hetero_lr_0 = HeteroLR(name="hetero_lr_0", max_iter=20)
# chain together all components
pipeline.add_component(reader_0)
-pipeline.add_component(dataio_0, data=Data(data=reader_0.output.data))
-pipeline.add_component(intersection_0, data=Data(data=dataio_0.output.data))
+pipeline.add_component(data_transform_0, data=Data(data=reader_0.output.data))
+pipeline.add_component(intersection_0, data=Data(data=data_transform_0.output.data))
pipeline.add_component(hetero_data_split_0, data=Data(data=intersection_0.output.data))
pipeline.add_component(hetero_lr_0, data=Data(train_data=hetero_data_split_0.output.data.train_data,
                                              validate_data=hetero_data_split_0.output.data.test_data))
```

@@ -156,13 +156,13 @@ parameters from the previous component. When a model from previous
component is used as input but the current component is of different
class from the previous component, `isometric_model` is used.

-Check below for a case from mini demo, where `model` from `dataio_0` is
-passed to `dataio_1`.
+Check below for a case from mini demo, where `model` from `data_transform_0` is
+passed to `data_transform_1`.

``` sourceCode python
-pipeline.add_component(dataio_1,
+pipeline.add_component(data_transform_1,
                       data=Data(data=reader_1.output.data),
-                      model=Model(dataio_0.output.model))
+                      model=Model(data_transform_0.output.model))
```

Here is a case of using `isometric model`. `HeteroFeatureSelection` uses
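(The rest of this example is collapsed in the diff view. As a hedged sketch only — the component names `hetero_feature_binning_0` and `hetero_feature_selection_0` and the exact wiring are assumptions based on the surrounding docs, not taken from this commit — the isometric case typically looks like:)

``` sourceCode python
# Assumed names, for illustration: a selection component of a different
# class consumes the binning model via isometric_model instead of model.
pipeline.add_component(hetero_feature_selection_0,
                       data=Data(data=intersection_0.output.data),
                       isometric_model=Model(hetero_feature_binning_0.output.model))
```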
@@ -200,7 +200,7 @@ Below code sets cache output from `intersection_0` as cache input of
`intersection_1`.

``` sourceCode python
-pipeline.add_component(intersection_1, data=Data(data=dataio_0.output.data), cache=Cache(intersect_0.output.cache))
+pipeline.add_component(intersection_1, data=Data(data=data_transform_0.output.data), cache=Cache(intersect_0.output.cache))
```

To load cache from another job, use `CacheLoader` component. In this
@@ -210,7 +210,7 @@ input.

``` sourceCode python
pipeline.add_component(cache_loader_0)
-pipeline.add_component(intersect_0, data=Data(data=dataio_0.output.data), cache=Cache(cache_loader_0.output.cache))
+pipeline.add_component(intersect_0, data=Data(data=data_transform_0.output.data), cache=Cache(cache_loader_0.output.cache))
```

### Parameter
@@ -224,8 +224,8 @@ per individual participant.
<!-- end list -->

``` sourceCode python
-from pipeline.component import DataIO
-dataio_0 = DataIO(name="dataio_0", input_format="dense", output_format="dense",
+from pipeline.component import DataTransform
+data_transform_0 = DataTransform(name="data_transform_0", input_format="dense", output_format="dense",
outlier_replace=False)
```

Expand All @@ -234,11 +234,11 @@ dataio_0 = DataIO(name="dataio_0", input_format="dense", output_format="dense",
<!-- end list -->

``` sourceCode python
-# set guest dataio_0 component parameters
-guest_dataio_0 = dataio_0.get_party_instance(role='guest', party_id=9999)
-guest_dataio_0.component_param(with_label=True)
-# set host dataio_0 component parameters
-dataio_0.get_party_instance(role='host', party_id=10000).component_param(with_label=False)
+# set guest data_transform_0 component parameters
+guest_data_transform_0 = data_transform_0.get_party_instance(role='guest', party_id=9999)
+guest_data_transform_0.component_param(with_label=True)
+# set host data_transform_0 component parameters
+data_transform_0.get_party_instance(role='host', party_id=10000).component_param(with_label=False)
```

### Task Info
@@ -258,6 +258,6 @@ To obtain output of a component, the component needs to be first
extracted from pipeline:

``` sourceCode python
print(pipeline.get_component("dataio_0").get_output_data(limits=10))
print(pipeline.get_component("data_transform_0").get_output_data(limits=10))
```
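Beyond `get_output_data`, an extracted component exposes further task-info getters. A hedged sketch — the method names and the `hetero_lr_0` component are assumptions based on the FATE-Pipeline docs, not on this commit:

``` sourceCode python
# Assumed API: get_summary() and get_model_param() on an extracted component.
lr_component = pipeline.get_component("hetero_lr_0")
print(lr_component.get_summary())      # training summary metrics
print(lr_component.get_model_param())  # fitted model parameters
```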

2 changes: 1 addition & 1 deletion doc/develop/develop_guide.md
@@ -249,7 +249,7 @@ In this section, we describe how to do Step 2-5. Many common interfaces are prov
def fit(self, train_data, validate_data):
```

-Both `train_data` and `validate_data` (optional) are Tables from upstream components(DataIO for example).
+Both `train_data` and `validate_data` (optional) are Tables from upstream components (DataTransform, for example).
The `fit` method is the entry point to launch the training of the modeling component or the feature engineering component.
When starting a training task, this method will be called by `model_base` automatically.
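To make the contract concrete, here is an illustrative skeleton only — the base-class import path and the component name are assumptions based on this guide, not verbatim FATE code:

```python
# Hypothetical component; ModelBase path assumed from federatedml's model_base.
from federatedml.model_base import ModelBase


class MyFeatureEngineer(ModelBase):
    def fit(self, train_data, validate_data=None):
        # train_data / validate_data are Tables produced by upstream
        # components (e.g. DataTransform); model_base invokes fit()
        # automatically when a training task starts.
        ...
```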

30 changes: 15 additions & 15 deletions doc/federatedml_component/union.md
@@ -1,7 +1,7 @@
# Union

Union module combines given tables into one while keeping unique entry
-ids. Union is a local module. Like DataIO, this module can be run on the
+ids. Union is a local module. Like DataTransform, this module can be run on the
side of Host or Guest, and running this module does not require any
interaction with outside parties.

@@ -26,7 +26,7 @@ with FATE-Pipeline:
"module": "Union",
"input": {
"data": {
"data": ["dataio_0.data", "dataio_1.data", "dataio_2.data"]
"data": ["data_transform_0.data", "data_transform_1.data", "data_transform_2.data"]
}
},
"output": {
@@ -44,7 +44,7 @@ with DSL v2:
"module": "Union",
"input": {
"data": {
"data": ["dataio_0.data", "dataio_1.data", "dataio_2.data"]
"data": ["data_transform_0.data", "data_transform_1.data", "data_transform_2.data"]
}
},
"output": {
@@ -55,24 +55,24 @@ with DSL v2:
```

Upstream tables will enter Union module in this order:
<span class="title-ref">dataio\_0.data</span>,
<span class="title-ref">dataio\_1.data</span>,
<span class="title-ref">dataio\_2.data</span> .
<span class="title-ref">data\_transform\_0.data</span>,
<span class="title-ref">data\_transform\_1.data</span>,
<span class="title-ref">data\_transform\_2.data</span> .

If an id <span class="title-ref">42</span> exists in both
<span class="title-ref">dataio\_0.data</span> and
<span class="title-ref">dataio\_1.data</span>, and:
<span class="title-ref">data\_transform\_0.data</span> and
<span class="title-ref">data\_transform\_1.data</span>, and:

1. `keep\_duplicate` set to false: the value from
-<span class="title-ref">dataio\_0.data</span> is the one being kept
+<span class="title-ref">data\_transform\_0.data</span> is the one being kept
in the final result, its id unchanged.
2. `keep\_duplicate` set to true: the value from
<span class="title-ref">dataio\_0.data</span> and the one from
<span class="title-ref">dataio\_1.data</span> are both kept; the id
in <span class="title-ref">dataio\_0.data</span> will be transformed
to <span class="title-ref">42\_dataio\_0</span>, and the id in
<span class="title-ref">dataio\_1.data</span> to
<span class="title-ref">42\_dataio\_1</span>.
<span class="title-ref">data\_transform\_0.data</span> and the one from
<span class="title-ref">data\_transform\_1.data</span> are both kept; the id
in <span class="title-ref">data\_transform\_0.data</span> will be transformed
to <span class="title-ref">42\_data\_transform\_0</span>, and the id in
<span class="title-ref">data\_transform\_1.data</span> to
<span class="title-ref">42\_data\_transform\_1</span>.


<!-- mkdocs
2 changes: 1 addition & 1 deletion doc/tutorial/dsl_conf/dsl_conf_tutorial.md
@@ -59,7 +59,7 @@ $ flow job config -j 2020103015490073208469 -r guest -p 9999 -o ./
We use flow_client to deploy components needed in the prediction task:
```sh
$ flow model deploy --model-id guest-10000#host-10000#model --model-version 2020103015490073208469 --cpn-list "dataio_0, intersection_0, hetero_secure_boost_0"
$ flow model deploy --model-id guest-10000#host-10000#model --model-version 2020103015490073208469 --cpn-list "data_transform_0, intersection_0, hetero_secure_boost_0"
```
We can modify existing predict conf by replacing model_id, model_version and data set name with yours to make a new
@@ -118,7 +118,7 @@
"namespace": "performance"
}
},
"dataio_0": {
"data_transform_0": {
"missing_fill": true,
"missing_fill_method": "mean",
"outlier_replace": false,
@@ -130,7 +130,7 @@
"label_type": "int",
"output_format": "dense"
},
"dataio_1": {
"data_transform_1": {
"missing_fill": true,
"missing_fill_method": "mean",
"outlier_replace": false,
@@ -158,14 +158,14 @@
"namespace": "performance"
}
},
"dataio_0": {
"data_transform_0": {
"input_format": "tag",
"delimitor": ";",
"tag_with_value": true,
"with_label": false,
"output_format": "dense"
},
"dataio_1": {
"data_transform_1": {
"input_format": "tag",
"delimitor": ";",
"tag_with_value": true,
@@ -16,8 +16,8 @@
]
}
},
"dataio_0": {
"module": "DataIO",
"data_transform_0": {
"module": "DataTransform",
"input": {
"data": {
"data": [
@@ -34,16 +34,16 @@
]
}
},
"dataio_1": {
"module": "DataIO",
"data_transform_1": {
"module": "DataTransform",
"input": {
"data": {
"data": [
"reader_1.data"
]
},
"model": [
"dataio_0.model"
"data_transform_0.model"
]
},
"output": {
@@ -60,7 +60,7 @@
"input": {
"data": {
"data": [
"dataio_0.data"
"data_transform_0.data"
]
}
},
@@ -75,7 +75,7 @@
"input": {
"data": {
"data": [
"dataio_1.data"
"data_transform_1.data"
]
}
},
1 change: 1 addition & 0 deletions python/fate_client/pipeline/component/dataio.py
@@ -27,6 +27,7 @@ def __init__(self, **kwargs):

#print (self.name)
LOGGER.debug(f"{self.name} component created")
LOGGER.warning("DataIO should not be use in training task since FATE-v1.9.0, use DataTransform instead")
new_kwargs = self.erase_component_base_param(**kwargs)

DataIOParam.__init__(self, **new_kwargs)
