editing query interface
Signed-off-by: tanmc123 <[email protected]>
tanmc123 committed Dec 19, 2019
1 parent 004699e commit ac4a795
Showing 11 changed files with 415 additions and 11 deletions.
@@ -277,7 +277,7 @@ cd FATE/cluster-deploy/scripts
If you need to deploy all of the components, run:

```
-bash deploy_cluster_multinode.sh build all
+bash deploy_cluster_multinode.sh build all
```

If you only need to deploy some of the components:
17 changes: 17 additions & 0 deletions examples/__init__.py
@@ -0,0 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -78,10 +78,15 @@
            }
        },
        "hetero_feature_selection_0": {
-           "select_cols": -1,
-           "filter_methods": ["unique_value", "iv_value_thres",
+           "select_col_indexes": -1,
+           "select_names": [],
+           "filter_methods": ["manually", "unique_value", "iv_value_thres",
                               "coefficient_of_variation_value_thres", "iv_percentile", "outlier_cols"],
+           "local_only": false,
+           "manually_param": {
+               "filter_out_indexes": [],
+               "filter_out_names": []
+           },
            "unique_param": {
                "eps": 1e-6
            },
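For context, the new `manually` filter reads its configuration from `manually_param`: any columns listed in `filter_out_indexes` or `filter_out_names` are presumably excluded by that step before the remaining filters are applied. A minimal sketch of a filled-in block (the indexes and names below are hypothetical placeholders, not values from this commit):

```json
"hetero_feature_selection_0": {
    "select_col_indexes": -1,
    "select_names": [],
    "filter_methods": ["manually", "unique_value"],
    "manually_param": {
        "filter_out_indexes": [0, 3],
        "filter_out_names": ["x0", "x3"]
    },
    "unique_param": {
        "eps": 1e-6
    }
}
```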
16 changes: 10 additions & 6 deletions examples/federatedml-1.x-examples/quick_run.py
@@ -33,8 +33,11 @@
HOST = 'host'

# You can set up your own configuration files here
-DSL_PATH = 'hetero_logistic_regression/test_hetero_lr_train_job_dsl.json'
-SUBMIT_CONF_PATH = 'hetero_logistic_regression/test_hetero_lr_train_job_conf.json'
+# DSL_PATH = 'hetero_logistic_regression/test_hetero_lr_train_job_dsl.json'
+# SUBMIT_CONF_PATH = 'hetero_logistic_regression/test_hetero_lr_train_job_conf.json'
+
+DSL_PATH = 'homo_logistic_regression/test_homolr_train_job_dsl.json'
+SUBMIT_CONF_PATH = 'homo_logistic_regression/test_homolr_train_job_conf.json'

TEST_PREDICT_CONF = HOME_DIR + '/test_predict_conf.json'

@@ -43,10 +46,11 @@
# TASK = 'predict'

# Put your data to /examples/data folder and indicate the data names here
-GUEST_DATA_SET = 'breast_b.csv'
-HOST_DATA_SET = 'breast_a.csv'
-# GUEST_DATA_SET = 'homo_breast_train_guest.csv'
-# HOST_DATA_SET = 'homo_breast_train_host.csv'
+# GUEST_DATA_SET = 'breast_b.csv'
+# HOST_DATA_SET = 'breast_a.csv'
+GUEST_DATA_SET = 'default_credit_homo_guest.csv'
+HOST_DATA_SET = 'default_credit_homo_host.csv'


# Define your party ids here
GUEST_ID = 10000
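With the defaults switched to the homo LR job and the default_credit data, the quick-start flow stays the same: place the CSV files under `examples/data` and launch the script directly. A minimal invocation sketch, assuming it is run from the `examples/federatedml-1.x-examples` directory as in the FATE quick-start guide:

```bash
# upload the data and submit the training job selected by the constants above
python quick_run.py
```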
17 changes: 17 additions & 0 deletions examples/running_tools/data_processing/__init__.py
@@ -0,0 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
51 changes: 51 additions & 0 deletions examples/running_tools/data_processing/merge_data.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import sys
import traceback

import pandas as pd

from examples.running_tools import run_config
from examples.running_tools.base_task import BaseTask

# valid data set: "breast", "default_credit", "give_credit", "vehicle"
file_path1 = '/data/projects/fate/python_mc/examples/data/dc.csv'
file_path2 = '/data/projects/fate/python_mc/examples/data/training_data_phone.csv'

f1_header = None
f2_header = 0

output_file_name = 'dc_data.csv'


class MergeData(BaseTask):
    def merge_hetero(self):
        # Read both source files; f1 has no header row, f2 uses its first row as the header.
        df1 = pd.read_csv(file_path1, index_col=0, header=f1_header)
        print('df1', df1)
        df2 = pd.read_csv(file_path2, index_col=0, header=f2_header)
        print('df2', df2)
        # Stack the two frames row-wise, keeping only the columns they have in common.
        total_train_df = pd.concat([df2, df1], axis=0, sort=False, join='inner')
        total_train_df.to_csv(run_config.TEMP_DATA_PATH + output_file_name)


if __name__ == '__main__':
    merge_data = MergeData()
    merge_data.merge_hetero()
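Since the script imports `run_config` from the `examples.running_tools` package, it presumably needs the directory containing `examples/` on the Python path. A usage sketch under that assumption (the deployment path is taken from the hard-coded file paths above):

```bash
# run from the python root of the FATE deployment so that `examples` is importable
cd /data/projects/fate/python_mc
python -m examples.running_tools.data_processing.merge_data
```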
234 changes: 234 additions & 0 deletions examples/running_tools/data_processing/split_upload.py
@@ -0,0 +1,234 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import math
import sys
import time
import traceback

import pandas as pd

from examples.running_tools import run_config
from examples.running_tools.base_task import BaseTask

# valid data set: "breast", "default_credit", "give_credit", "vehicle"
DATA_SET = 'HomeCredit_homo_guest8'
MODE = "hetero"
PARTITION_NUM = 10

ROLE = 'guest'

TRAIN_RATIO = 0.8  # Ratio of the training set; the test set takes the remaining 1 - TRAIN_RATIO
GUEST_RATIO = 0.5  # For hetero, the guest's share of features; for homo, its share of samples
HOST_RATIOS = [0.2, 0.3]

need_generate_data = True
split_train_test_only = True

add_empty_column = False
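# Notes on the switches above:
#   - MODE 'hetero' splits the data set by columns (features); 'homo' splits it by rows (samples).
#   - GUEST_RATIO is the guest party's share; each entry in HOST_RATIOS is one host's share,
#     and the last host receives whatever remains.
#   - split_train_test_only=True produces only train/test files and skips the per-party split.
#   - need_generate_data regenerates the CSV files before the upload configurations are written.
#   - add_empty_column appends a constant 'feature_0' column (value 1e-5) to the source data.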


class DataTask(BaseTask):
    @staticmethod
    def get_file_names():
        train_file_names = ['_'.join([MODE, DATA_SET, 'train', ROLE])]
        test_file_names = ['_'.join([MODE, DATA_SET, 'test', ROLE])]

        if not split_train_test_only:
            for idx, _ in enumerate(HOST_RATIOS):
                train_file_names.append('_'.join([MODE, DATA_SET, 'train', 'host', str(idx)]))
                test_file_names.append('_'.join([MODE, DATA_SET, 'test', 'host', str(idx)]))

        return train_file_names, test_file_names

    @staticmethod
    def __write_upload_conf(json_info, file_name):
        partition_num = str(PARTITION_NUM) + 'p'

        file_path = run_config.TEMP_DATA_PATH + file_name + '.csv'
        config_file_path = run_config.TEMP_CONFIG_PATH + file_name + '.json'
        name_space = '_'.join([MODE, DATA_SET])

        json_info['file'] = file_path
        json_info['table_name'] = '_'.join([file_name, partition_num])
        json_info['namespace'] = name_space
        json_info['work_mode'] = run_config.WORK_MODE
        json_info['partition'] = PARTITION_NUM

        config = json.dumps(json_info, indent=4)
        with open(config_file_path, "w") as fout:
            fout.write(config + "\n")
        return config_file_path

    def make_upload_conf(self):
        original_upload_conf = run_config.UPLOAD_TEMPLATE
        with open(original_upload_conf, 'r', encoding='utf-8') as f:
            json_info = json.loads(f.read())

        train_file_names, test_file_names = self.get_file_names()
        config_paths = [self.__write_upload_conf(json_info, x) for x in train_file_names + test_file_names]

        return config_paths

    def generate_data_files(self):
        data_path = run_config.DATA_PATH + DATA_SET + '.csv'
        total_data_df = pd.read_csv(data_path, index_col=0)
        if add_empty_column:
            total_data_df['feature_0'] = 0.00001
        train_file_names, test_file_names = self.get_file_names()
        train_data, test_data = self.__split_train_test(total_data_df)

        if not split_train_test_only:
            # Split each of the train and test sets across the parties as well.
            if MODE == 'hetero':
                self.__split_hetero(train_data, train_file_names)
                self.__split_hetero(test_data, test_file_names)
            else:
                self.__split_homo(train_data, train_file_names)
                self.__split_homo(test_data, test_file_names)
        else:
            train_data.to_csv(run_config.TEMP_DATA_PATH + train_file_names[0] + '.csv')
            test_data.to_csv(run_config.TEMP_DATA_PATH + test_file_names[0] + '.csv')


    @staticmethod
    def __split_homo(total_train_df, file_names):
        # Homo split: divide rows; the guest gets the first GUEST_RATIO of samples,
        # each host gets its ratio of rows, and the last host takes the remainder.
        guest_num = math.floor(total_train_df.shape[0] * GUEST_RATIO)
        guest_df = total_train_df[0: guest_num]
        guest_df.to_csv(run_config.TEMP_DATA_PATH + file_names[0] + '.csv')

        last_num = guest_num
        for idx, host_ratio in enumerate(HOST_RATIOS[: -1]):
            host_num = math.floor(total_train_df.shape[0] * host_ratio)
            host_df = total_train_df[last_num: last_num + host_num]
            host_df.to_csv(run_config.TEMP_DATA_PATH + file_names[idx + 1] + '.csv')
            last_num += host_num
        last_host_df = total_train_df.iloc[last_num:]
        last_host_df.to_csv(run_config.TEMP_DATA_PATH + file_names[-1] + '.csv')

    @staticmethod
    def __split_hetero(total_train_df, file_names):
        # Hetero split: divide columns; the guest gets the first GUEST_RATIO of features,
        # each host gets its ratio of columns, and the last host takes the remainder.
        feature_nums = total_train_df.shape[1]
        guest_feature_nums = math.floor(feature_nums * GUEST_RATIO)

        guest_df = total_train_df.iloc[:, :guest_feature_nums]
        guest_df.to_csv(run_config.TEMP_DATA_PATH + file_names[0] + '.csv')

        last_num = guest_feature_nums
        for idx, host_ratio in enumerate(HOST_RATIOS[: -1]):
            host_feature_nums = math.floor(feature_nums * host_ratio)
            host_df = total_train_df.iloc[:, last_num: last_num + host_feature_nums]
            host_df.to_csv(run_config.TEMP_DATA_PATH + file_names[idx + 1] + '.csv')
            last_num += host_feature_nums
        last_host_df = total_train_df.iloc[:, last_num:]
        last_host_df.to_csv(run_config.TEMP_DATA_PATH + file_names[-1] + '.csv')

    @staticmethod
    def __split_train_test(total_train_df):
        total_num = total_train_df.shape[0]
        train_num = int(total_num * TRAIN_RATIO)
        train_data = total_train_df.iloc[:train_num]
        test_data = total_train_df.iloc[train_num:]
        return train_data, test_data

    @staticmethod
    def save_loaded_info(uploaded_files):
        saved_result_path = run_config.SAVE_RESULT_PATH
        with open(saved_result_path, 'r', encoding='utf-8') as f:
            saved_result_json = json.loads(f.read())
        uploaded_table_names = saved_result_json.get('uploaded_table_names')
        uploaded_namespaces = saved_result_json.get('uploaded_namespaces')
        for table_name, namespace in uploaded_files:
            uploaded_table_names.append(table_name)
            uploaded_namespaces.append(namespace)
        saved_result_json['uploaded_table_names'] = uploaded_table_names
        saved_result_json['uploaded_namespaces'] = uploaded_namespaces
        config = json.dumps(saved_result_json, indent=4)
        with open(saved_result_path, "w") as fout:
            fout.write(config + "\n")

    def check_tables(self, uploaded_files=None):
        if uploaded_files is not None:
            for table_name, namespace in uploaded_files:
                self.get_table_info(table_name, namespace)
        else:
            saved_result_path = run_config.SAVE_RESULT_PATH
            with open(saved_result_path, 'r', encoding='utf-8') as f:
                saved_result_json = json.loads(f.read())
            uploaded_table_names = saved_result_json.get('uploaded_table_names')
            uploaded_namespaces = saved_result_json.get('uploaded_namespaces')
            for table_name, namespace in zip(uploaded_table_names, uploaded_namespaces):
                self.get_table_info(table_name, namespace)

    def upload_data(self):
        if need_generate_data:
            self.generate_data_files()
        config_paths = self.make_upload_conf()
        uploaded_files = []
        for path in config_paths:
            # Submit each upload config through fate_flow and record the resulting table info.
            cmd = ['python', run_config.FATE_FLOW_PATH, "-f", "upload", "-c", path]
            stdout = self.start_task(cmd)
            try:
                table_info = (stdout['data']['table_name'], stdout['data']['namespace'])
                uploaded_files.append(table_info)
            except (KeyError, TypeError):
                raise ValueError("Cannot obtain table info from stdout, stdout is : {}".format(stdout))
            time.sleep(3)
        self.save_loaded_info(uploaded_files)
        self.check_tables(uploaded_files)

    def destroy_data(self):
        saved_result_path = run_config.SAVE_RESULT_PATH
        with open(saved_result_path, 'r', encoding='utf-8') as f:
            saved_result_json = json.loads(f.read())
        uploaded_table_names = saved_result_json.get('uploaded_table_names')
        uploaded_namespaces = saved_result_json.get('uploaded_namespaces')
        for table_name, namespace in zip(uploaded_table_names, uploaded_namespaces):
            cmd = ["python", run_config.FATE_FLOW_PATH, "-f", "table_delete", "-t", table_name, "-n", namespace]
            stdout = self.start_task(cmd)
            print("deleting {}, {}, stdout: {}".format(table_name, namespace, stdout))
        saved_result_json['uploaded_table_names'] = []
        saved_result_json['uploaded_namespaces'] = []
        config = json.dumps(saved_result_json, indent=4)
        with open(saved_result_path, "w") as fout:
            fout.write(config + "\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '--func', required=False, type=str, help="run function",
                        choices=('upload', 'check', 'destroy'), default='upload')

    try:
        args = parser.parse_args()
        data_task_obj = DataTask()
        if args.func == 'upload':
            data_task_obj.upload_data()
        elif args.func == 'check':
            data_task_obj.check_tables()
        elif args.func == 'destroy':
            data_task_obj.destroy_data()

    except Exception as e:
        exc_type, exc_value, exc_traceback_obj = sys.exc_info()
        response = {'retcode': 100, 'retmsg': traceback.format_exception(exc_type, exc_value, exc_traceback_obj)}
        print(json.dumps(response, indent=4))
        print()
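The `-f/--func` flag selects which of the three entry points runs; the choices come directly from the argparse definition above. A usage sketch, assuming the script is run as a module from the same python root as merge_data.py:

```bash
# generate the split CSVs, write per-file upload configs, and submit them via fate_flow
python -m examples.running_tools.data_processing.split_upload -f upload

# query fate_flow for every table recorded during previous uploads
python -m examples.running_tools.data_processing.split_upload -f check

# delete the uploaded tables and clear the saved table bookkeeping
python -m examples.running_tools.data_processing.split_upload -f destroy
```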