From 33eb5ccaa6ed3fc1e1616fa107ad04e7d3ac0dde Mon Sep 17 00:00:00 2001
From: schmia98 <schmia98@zedat.fu-berlin.de>
Date: Mon, 9 Sep 2024 12:33:22 +0000
Subject: [PATCH] Run on HPC

- slurm scripts
- analysis.py
- ...
---
 README.md                   |   2 +-
 attack.py                   | 183 ++++++++++--------
 config.yaml                 |  13 +-
 config_adult_gan.yaml       |   7 +
 config_adult_tvae.yaml      |   7 +
 config_housing_gan.yaml     |   7 +
 config_housing_tvae.yaml    |   7 +
 main.py                     |   5 +
 result_analysis/analysis.py |  77 ++++++++
 .../appendix.ipynb          |   0
 .../plot_results.ipynb      |   0
 run_slurm.sh                |  37 ++++
 12 files changed, 259 insertions(+), 86 deletions(-)
 create mode 100644 config_adult_gan.yaml
 create mode 100644 config_adult_tvae.yaml
 create mode 100644 config_housing_gan.yaml
 create mode 100644 config_housing_tvae.yaml
 create mode 100644 result_analysis/analysis.py
 rename appendix.ipynb => result_analysis/appendix.ipynb (100%)
 rename plot_results.ipynb => result_analysis/plot_results.ipynb (100%)
 create mode 100755 run_slurm.sh

diff --git a/README.md b/README.md
index a8b4ad7..fab1264 100644
--- a/README.md
+++ b/README.md
@@ -28,5 +28,5 @@ poetry shell
 ### Usage
 
 ```bash
-python main.py config.yaml
+python main.py config_adult_tvae.yaml
 ```
\ No newline at end of file
diff --git a/attack.py b/attack.py
index aebee1e..d5ac2da 100644
--- a/attack.py
+++ b/attack.py
@@ -1,3 +1,4 @@
+import math
 import sys
 from typing import Optional, Tuple
 from sklearn.metrics import accuracy_score, roc_auc_score
@@ -11,11 +12,11 @@ from sdv.single_table import TVAESynthesizer, CTGANSynthesizer
 from sdv.metadata import SingleTableMetadata
 from data_loader import adult, housing
 from scipy.stats import entropy, pearsonr
-
+import multiprocessing as mp
+import os
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
 def LOGAN_D1(X_test: np.ndarray, X_synth: np.ndarray, X_ref: np.ndarray, epochs: int) -> np.ndarray:
 
     num = min(X_synth.shape[0], X_ref.shape[0])
@@ -184,91 +185,115 @@ def compute_metrics_baseline(
     auc = roc_auc_score(y_true, y_scores, sample_weight=sample_weight)
     return acc, auc
 
+def housing() -> np.ndarray:
+    scaler = StandardScaler()
+    X = fetch_california_housing().data
+    np.random.shuffle(X)
+    return scaler.fit_transform(X)
+
+def run_async(reference_set, non_mem_set, mem_set, synth_set, epochs, rep_total, results_list, gen_model, dataset):
+    print('Start:', reference_set.shape[0], mem_set.shape[0], synth_set.shape[0], epochs, rep_total)
+    results = {}
+    for rep in range(rep_total):
+
+        X_test = np.concatenate([mem_set, non_mem_set])
+        Y_test = np.concatenate(
+            [np.ones(mem_set.shape[0]), np.zeros(non_mem_set.shape[0])]
+        ).astype(bool)
+
+        score = {}
+        score["LOGAN_D1"] = LOGAN_D1(X_test, synth_set.values, reference_set, epochs)
+        # score["MC"] = MC(X_test, synth_set.values)
+        # score["gan_leaks"] = GAN_leaks(X_test, synth_set.values)
+        score["metric_enhanced_PC"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='PC')
+        score["metric_enhanced_AVG"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs,
+                                                       enhancement_type='AVG')
+        score["metric_enhanced_KLD"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs,
+                                                       enhancement_type='KLD')
+
+        for name, y_scores in score.items():
+            acc, auc = compute_metrics_baseline(y_scores, Y_test, sample_weight=None)
+            results[name] = {
+                "accuracy": acc,
+                "aucroc": auc,
+            }
+
+    for name, metrics in results.items():
+        results_list.append({
+            "method": name,
+            "model": gen_model,
+            "dataset": dataset,
+            "reference_set_size": reference_set.shape[0],
+            "mem_set_size": mem_set.shape[0],
+            "synthetic_size": synth_set.shape[0],
+            "epochs": epochs,
+            "accuracy": metrics["accuracy"],
+            "aucroc": metrics["aucroc"]
+        })
+    print('End:', reference_set.shape[0], mem_set.shape[0], synth_set.shape[0], epochs, rep_total)
+    return results_list
+
 def run_experiments(config):
     dataset_functions = {
         'housing': housing,
         'adult': adult
     }
-
-    dataset = dataset_functions[config.get('dataset', 'adult')]()
+    dataset_name = config.get('dataset')
+    dataset = dataset_functions[dataset_name]()
     ndata = dataset.shape[0]
 
     metadata = SingleTableMetadata()
     metadata.detect_from_dataframe(pd.DataFrame(dataset))
 
-    mem_set_size_list = config.get('mem_set_size_list', [20])
-    reference_set_size_list = config.get('reference_set_size_list', [100])
-    synthetic_sizes = config.get('synthetic_sizes', [100])
-    epochs_list = config.get('epochs_list', [300])
-    rep_total = config.get('rep_total', 10)
-
-    results_list = []
-
-    experiment_len = len(mem_set_size_list) * len(reference_set_size_list) * len(synthetic_sizes) * len(epochs_list) * rep_total
-    i = 0
-
-    for reference_set_size in reference_set_size_list:
-        for mem_set_size in mem_set_size_list:
-            if mem_set_size * 2 + reference_set_size >= ndata:
-                # mem_set_size or ref_set_size too large
-                continue
-            for epochs in epochs_list:
-
-                results = {}
-
-                for rep in range(rep_total):
-                    np.random.shuffle(dataset)
-                    mem_set = dataset[:mem_set_size]
-                    non_mem_set = dataset[mem_set_size : 2 * mem_set_size]
-                    reference_set = dataset[-reference_set_size:]
-
-                    df = pd.DataFrame(mem_set)
-                    df.columns = [str(_) for _ in range(dataset.shape[1])]
-
-                    generator = TVAESynthesizer(metadata=metadata)
-
-                    generator.fit(df)
-
-                    for synthetic_size in synthetic_sizes:
-                        i += 1
-                        # Create synthethic sets
-                        synth_set = generator.sample(synthetic_size)
-
-                        X_test = np.concatenate([mem_set, non_mem_set])
-                        Y_test = np.concatenate(
-                            [np.ones(mem_set.shape[0]), np.zeros(non_mem_set.shape[0])]
-                        ).astype(bool)
-
-                        score = {}
-                        score["LOGAN_D1"] = LOGAN_D1(X_test, synth_set.values, reference_set, epochs)
-                        # score["MC"] = MC(X_test, synth_set.values)
-                        # score["gan_leaks"] = GAN_leaks(X_test, synth_set.values)
-                        # score["metric_enhanced_PC"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='PC')
-                        score["metric_enhanced_AVG"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='AVG')
-                        score["metric_enhanced_KLD"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='KLD')
-
-                        for name, y_scores in score.items():
-                            acc, auc = compute_metrics_baseline(y_scores, Y_test, sample_weight=None)
-                            results[name] = {
-                                "accuracy": acc,
-                                "aucroc": auc,
-                            }
-
-                        sys.stdout.write('\r')
-                        j = i / experiment_len
-                        sys.stdout.write("[%-20s] %d%%" % ('='*int(20*j), 100*j))
-                        sys.stdout.write(f' - reference_set_size, mem_set_size, synthetic_size, epochs: {reference_set_size}, {mem_set_size}, {synthetic_size}, {epochs}')
-                        sys.stdout.flush()
-
-                        for name, metrics in results.items():
-                            results_list.append({
-                                "method": name,
-                                "reference_set_size": reference_set_size,
-                                "mem_set_size": mem_set_size,
-                                "synthetic_size": synthetic_size,
-                                "epochs": epochs,
-                                "accuracy": metrics["accuracy"],
-                                "aucroc": metrics["aucroc"]
-                            })
+    mem_set_size_list = config.get('mem_set_size_list')
+    reference_set_size_list = config.get('reference_set_size_list')
+    synthetic_sizes = config.get('synthetic_sizes')
+    epochs_list = config.get('epochs_list')
+    rep_total = config.get('rep_total')
+    gen_model = config.get('model')
+
+    global_results = []
+
+    job_pool = mp.Pool(max(int(os.environ['SLURM_CPUS_PER_TASK']) if 'SLURM_CPUS_PER_TASK' in os.environ else 0, int(mp.cpu_count()*0.25)))
+    print(max(int(os.environ['SLURM_CPUS_PER_TASK']) if 'SLURM_CPUS_PER_TASK' in os.environ else 0, int(mp.cpu_count()*0.25)))
+    for mem_set_size in mem_set_size_list:
+
+        np.random.shuffle(dataset)
+        mem_set = dataset[:mem_set_size]
+
+        df = pd.DataFrame(mem_set)
+        df.columns = [str(_) for _ in range(dataset.shape[1])]
+
+        if gen_model == 'tvae':
+            generator = TVAESynthesizer(metadata=metadata)
+        else:
+            generator = CTGANSynthesizer(metadata=metadata)
+
+        generator.fit(df)
+
+        for synthetic_size in synthetic_sizes:
+            # Create synthetic sets
+            synth_set = generator.sample(synthetic_size)
+            for reference_set_size in reference_set_size_list:
+
+                if mem_set_size * 2 + reference_set_size >= ndata:
+                    # mem_set_size or ref_set_size too large
+                    continue
+
+                non_mem_set = dataset[mem_set_size: 2 * mem_set_size]
+                reference_set = dataset[-reference_set_size:]
+
+                results_list = []
+                async_result = []
+                for epochs in epochs_list:
+                    async_result.append(job_pool.apply_async(run_async, [reference_set, non_mem_set, mem_set, synth_set, epochs, rep_total, results_list, gen_model, dataset_name]))
+
+                for result in async_result:
+                    results_list += result.get()
+
+                tmp_df = pd.DataFrame(results_list)
+                tmp_df.to_csv(os.path.join('results', "%s_%s_m%d_s%d_r%d.csv" % (dataset_name, gen_model, mem_set_size, synthetic_size, reference_set_size)), sep=';', index=False)
+                global_results.append(results_list)
+
     sys.stdout.write('\n')
-    return results_list
+    return global_results
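Note on the dispatch pattern introduced above in `run_experiments`: `apply_async` hands each `run_async` call to the worker pool and returns a handle immediately, while `.get()` blocks until that call has finished and returns its result. A minimal, self-contained sketch of the same pattern (the `square` task, the eight dummy inputs, and the floor of one worker are illustrative assumptions, not part of this patch):

```python
import multiprocessing as mp
import os

def square(x):
    # stand-in for run_async: any picklable top-level function can be dispatched
    return x * x

if __name__ == "__main__":
    # mirrors the pool sizing above: prefer SLURM_CPUS_PER_TASK, otherwise a
    # quarter of the local cores, with a floor of one worker
    n_workers = max(int(os.environ.get("SLURM_CPUS_PER_TASK", 0)), int(mp.cpu_count() * 0.25), 1)
    with mp.Pool(n_workers) as pool:
        handles = [pool.apply_async(square, [i]) for i in range(8)]  # returns immediately
        results = [h.get() for h in handles]                         # blocks until each task is done
    print(results)
```

Because main.py switches the start method to 'spawn' (see below), every argument of `apply_async` is pickled into a fresh worker process; the `results_list` passed into `run_async` is therefore a per-worker copy, and the values come back to the parent only through `result.get()`.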
"reference_set_size": reference_set.shape[0], + "mem_set_size": mem_set.shape[0], + "synthetic_size": synth_set.shape[0], + "epochs": epochs, + "accuracy": metrics["accuracy"], + "aucroc": metrics["aucroc"] + }) + print('End:', reference_set.shape[0], mem_set.shape[0], synth_set.shape[0], epochs, rep_total) + return results_list + def run_experiments(config): dataset_functions = { 'housing': housing, 'adult': adult } - - dataset = dataset_functions[config.get('dataset', 'adult')]() + dataset_name = config.get('dataset') + dataset = dataset_functions[dataset_name]() ndata = dataset.shape[0] metadata = SingleTableMetadata() metadata.detect_from_dataframe(pd.DataFrame(dataset)) - mem_set_size_list = config.get('mem_set_size_list', [20]) - reference_set_size_list = config.get('reference_set_size_list', [100]) - synthetic_sizes = config.get('synthetic_sizes', [100]) - epochs_list = config.get('epochs_list', [300]) - rep_total = config.get('rep_total', 10) - - results_list = [] - - experiment_len = len(mem_set_size_list) * len(reference_set_size_list) * len(synthetic_sizes) * len(epochs_list) * rep_total - i = 0 - - for reference_set_size in reference_set_size_list: - for mem_set_size in mem_set_size_list: - if mem_set_size * 2 + reference_set_size >= ndata: - # mem_set_size or ref_set_size too large - continue - for epochs in epochs_list: - - results = {} - - for rep in range(rep_total): - np.random.shuffle(dataset) - mem_set = dataset[:mem_set_size] - non_mem_set = dataset[mem_set_size : 2 * mem_set_size] - reference_set = dataset[-reference_set_size:] - - df = pd.DataFrame(mem_set) - df.columns = [str(_) for _ in range(dataset.shape[1])] - - generator = TVAESynthesizer(metadata=metadata) - - generator.fit(df) - - for synthetic_size in synthetic_sizes: - i += 1 - # Create synthethic sets - synth_set = generator.sample(synthetic_size) - - X_test = np.concatenate([mem_set, non_mem_set]) - Y_test = np.concatenate( - [np.ones(mem_set.shape[0]), np.zeros(non_mem_set.shape[0])] - ).astype(bool) - - score = {} - score["LOGAN_D1"] = LOGAN_D1(X_test, synth_set.values, reference_set, epochs) - # score["MC"] = MC(X_test, synth_set.values) - # score["gan_leaks"] = GAN_leaks(X_test, synth_set.values) - # score["metric_enhanced_PC"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='PC') - score["metric_enhanced_AVG"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='AVG') - score["metric_enhanced_KLD"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='KLD') - - for name, y_scores in score.items(): - acc, auc = compute_metrics_baseline(y_scores, Y_test, sample_weight=None) - results[name] = { - "accuracy": acc, - "aucroc": auc, - } - - sys.stdout.write('\r') - j = i / experiment_len - sys.stdout.write("[%-20s] %d%%" % ('='*int(20*j), 100*j)) - sys.stdout.write(f' - reference_set_size, mem_set_size, synthetic_size, epochs: {reference_set_size}, {mem_set_size}, {synthetic_size}, {epochs}') - sys.stdout.flush() - - for name, metrics in results.items(): - results_list.append({ - "method": name, - "reference_set_size": reference_set_size, - "mem_set_size": mem_set_size, - "synthetic_size": synthetic_size, - "epochs": epochs, - "accuracy": metrics["accuracy"], - "aucroc": metrics["aucroc"] - }) + mem_set_size_list = config.get('mem_set_size_list') + reference_set_size_list = config.get('reference_set_size_list') + synthetic_sizes = config.get('synthetic_sizes') + epochs_list = 
diff --git a/main.py b/main.py
index 50995a7..2ab42e7 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,9 @@ import sys
 from datetime import datetime
 
 import pandas as pd
+import multiprocessing as mp
+
+
 from attack import run_experiments
 
 
@@ -26,6 +29,8 @@ def main(config_file):
 
 
 if __name__ == "__main__":
+
+    mp.set_start_method('spawn')
     if len(sys.argv) != 2:
         print("Usage: python main.py <config_file>")
         sys.exit(1)
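For context on the `mp.set_start_method('spawn')` call added above: 'spawn' starts clean child interpreters instead of forking the parent, which avoids the known problems of forking a process that already holds a CUDA context (attack.py initializes torch on `DEVICE`). It must be set once, before any pool is created, and only under the `__main__` guard. A standalone sketch (the `worker` function and pool size are illustrative, not from this patch):

```python
import multiprocessing as mp

def worker(i):
    # runs in a freshly spawned interpreter, which re-imports this module
    return i + 1

if __name__ == "__main__":
    mp.set_start_method("spawn")   # must happen before the first Pool is created
    with mp.Pool(2) as pool:
        print(pool.map(worker, range(4)))
```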
diff --git a/result_analysis/analysis.py b/result_analysis/analysis.py
new file mode 100644
index 0000000..8486eb0
--- /dev/null
+++ b/result_analysis/analysis.py
@@ -0,0 +1,77 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+import numpy as np
+from statsmodels.api import OLS
+import matplotlib
+from sklearn.tree import DecisionTreeRegressor
+
+# os.chdir('test_data')
+#%%
+file_list = os.listdir()
+all_data = pd.DataFrame()
+
+for f in file_list:
+    if f.endswith('.csv') and 'tmp_all_data' not in f:
+        props = f[:-4].split('_')
+        tmp_df = pd.read_csv(f, sep=';')
+        tmp_df['experiment'] = f
+        # tmp_df = tmp_df[np.all(tmp_df[[ 'mem_set_size', 'reference_set_size', 'synthetic_size', 'epochs']] - [int(p[1:]) for p in props] == 0, axis=1)]
+        all_data = pd.concat([all_data,tmp_df], axis=0, ignore_index=True)
+
+all_data.to_csv('tmp_all_data.csv', sep=';', index=False)
+#%%
+# all_data.drop(columns=['method']).corr()
+#%%
+# all_data = pd.read_csv('tmp_all_data.csv', sep=';', low_memory=True)
+all_data.set_index('experiment', inplace=True)
+new_df = pd.concat([all_data.drop(columns=['method']),pd.get_dummies(all_data['method']).astype(int)], axis=1)
+desc = new_df.describe()
+#%%
+new_df_w_intercept = new_df.copy()
+new_df_w_intercept['intercept'] = 1
+
+lm1 = OLS(new_df_w_intercept['accuracy'], new_df_w_intercept.drop(columns=['aucroc', 'accuracy']))
+result1 = lm1.fit()
+print(result1.summary())
+
+lm2 = OLS(new_df_w_intercept['aucroc'], new_df_w_intercept.drop(columns=['aucroc', 'accuracy']))
+result2 = lm2.fit()
+print(result2.summary())
+#%%
+dtr = DecisionTreeRegressor()
+dtr.fit(new_df.drop(columns=['aucroc', 'accuracy']), new_df['accuracy'],)
+print(dict(zip(new_df.drop(columns=['aucroc', 'accuracy']),dtr.feature_importances_)))
+#%%
+def plot_with_varying_mem_set_size(this_data):
+    plt.figure()
+    plt.tight_layout()
+    print(this_data['synthetic_size'])
+    cmap = matplotlib.cm.get_cmap('Dark2')
+    meths_list = this_data['method'].unique().tolist()
+
+    legend_list = []
+
+    for m in range(len(meths_list)):
+        col = cmap(m)
+        ax = plt.gca()
+        x = this_data[this_data['method'] == meths_list[m]]['synthetic_size'].unique()
+        y_avg = this_data[this_data['method'] == meths_list[m]].groupby('synthetic_size').mean(numeric_only=True)['accuracy']
+        y_min = this_data[this_data['method'] == meths_list[m]].groupby('synthetic_size').min(numeric_only=True)['accuracy']
+        y_max = this_data[this_data['method'] == meths_list[m]].groupby('synthetic_size').max(numeric_only=True)['accuracy']
+
+        plt.plot(x, y_avg, color=col)
+        ax.fill_between(x, y_min, y_max, color=col, alpha=.3)
+
+        legend_list.append(meths_list[m])
+        legend_list.append(meths_list[m] + ' (Min/Max)')
+
+    plt.legend(legend_list)
+
+    # the group index is the experiment filename, so use positional access here
+    r = this_data['reference_set_size'].iloc[0]
+    s = this_data['mem_set_size'].iloc[0]
+    e = this_data['epochs'].iloc[0]
+    plt.savefig("fig_mem_set_size_" + str(r) + "_" + str(s) + "_" + str(e) + ".png")
+
+all_data.groupby(['reference_set_size', 'mem_set_size', 'epochs'], as_index=False).apply(lambda b: plot_with_varying_mem_set_size(b))
+
diff --git a/appendix.ipynb b/result_analysis/appendix.ipynb
similarity index 100%
rename from appendix.ipynb
rename to result_analysis/appendix.ipynb
diff --git a/plot_results.ipynb b/result_analysis/plot_results.ipynb
similarity index 100%
rename from plot_results.ipynb
rename to result_analysis/plot_results.ipynb
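analysis.py regresses accuracy and AUC-ROC on the experiment parameters plus one-hot-encoded attack methods with an explicit intercept column. A compact, self-contained sketch of that OLS setup on made-up data (the toy DataFrame is purely illustrative; unlike analysis.py, one dummy level is dropped here so the design matrix is not collinear with the intercept):

```python
import numpy as np
import pandas as pd
from statsmodels.api import OLS

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "method": rng.choice(["LOGAN_D1", "metric_enhanced_KLD"], size=50),
    "epochs": rng.choice([100, 200, 300, 400], size=50),
    "accuracy": rng.uniform(0.4, 0.9, size=50),
})

# one-hot encode the categorical column, add an intercept, fit OLS
X = pd.concat([toy[["epochs"]], pd.get_dummies(toy["method"], drop_first=True).astype(int)], axis=1)
X["intercept"] = 1  # statsmodels' OLS does not add a constant on its own
print(OLS(toy["accuracy"], X).fit().summary())
```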
diff --git a/run_slurm.sh b/run_slurm.sh
new file mode 100755
index 0000000..68ccc33
--- /dev/null
+++ b/run_slurm.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH --job-name=metric_mia
+
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:1
+
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --cpus-per-task=20
+#SBATCH --mem-per-cpu=500
+
+#SBATCH --time=7-00:00:00
+
+#SBATCH --output=job_%j.out
+#SBATCH --error=job_%j.err
+#SBATCH --qos=standard
+
+# Reporting
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=dar@zedat.fu-berlin.de
+
+
+# store job info in output file, if you want...
+scontrol show job $SLURM_JOBID
+
+rm results/*
+
+echo 'Activate virtual environment'
+source mia_3.11/bin/activate
+
+echo 'Start Python scripts'
+srun --exact -n1 -G1 python3.11 main.py config_adult_gan.yaml &
+srun --exact -n1 -G1 python3.11 main.py config_housing_gan.yaml &
+srun --exact -n1 -G1 python3.11 main.py config_adult_tvae.yaml &
+srun --exact -n1 -G1 python3.11 main.py config_housing_tvae.yaml &
+wait
--
GitLab