Commit 33eb5cca authored by schmia98

Run on HPC

- slurm scripts
- analysis.py 
- ...
parent 17230449
Merge request !1 (David)
@@ -28,5 +28,5 @@ poetry shell
### Usage
```bash
python main.py config.yaml
python main.py config_adult_tvae.yaml
```
\ No newline at end of file
import math
import sys
from typing import Optional, Tuple
from sklearn.metrics import accuracy_score, roc_auc_score
@@ -11,11 +12,11 @@ from sdv.single_table import TVAESynthesizer, CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from data_loader import adult, housing
from scipy.stats import entropy, pearsonr
import multiprocessing as mp
import os
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def LOGAN_D1(X_test: np.ndarray, X_synth: np.ndarray, X_ref: np.ndarray, epochs: int) -> np.ndarray:
num = min(X_synth.shape[0], X_ref.shape[0])
@@ -184,91 +185,115 @@ def compute_metrics_baseline(
auc = roc_auc_score(y_true, y_scores, sample_weight=sample_weight)
return acc, auc
def housing() -> np.ndarray:
scaler = StandardScaler()
X = fetch_california_housing().data
np.random.shuffle(X)
return scaler.fit_transform(X)
def run_async(reference_set, non_mem_set, mem_set, synth_set, epochs, rep_total, results_list, gen_model, dataset):
print('Start:', reference_set.shape[0], mem_set.shape[0], synth_set.shape[0], epochs, rep_total)
results = {}
for rep in range(rep_total):
X_test = np.concatenate([mem_set, non_mem_set])
Y_test = np.concatenate(
[np.ones(mem_set.shape[0]), np.zeros(non_mem_set.shape[0])]
).astype(bool)
score = {}
score["LOGAN_D1"] = LOGAN_D1(X_test, synth_set.values, reference_set, epochs)
# score["MC"] = MC(X_test, synth_set.values)
# score["gan_leaks"] = GAN_leaks(X_test, synth_set.values)
score["metric_enhanced_PC"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='PC')
score["metric_enhanced_AVG"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs,
enhancement_type='AVG')
score["metric_enhanced_KLD"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs,
enhancement_type='KLD')
for name, y_scores in score.items():
acc, auc = compute_metrics_baseline(y_scores, Y_test, sample_weight=None)
results[name] = {
"accuracy": acc,
"aucroc": auc,
}
for name, metrics in results.items():
results_list.append({
"method": name,
"model": gen_model,
"dataset": dataset,
"reference_set_size": reference_set.shape[0],
"mem_set_size": mem_set.shape[0],
"synthetic_size": synth_set.shape[0],
"epochs": epochs,
"accuracy": metrics["accuracy"],
"aucroc": metrics["aucroc"]
})
print('End:', reference_set.shape[0], mem_set.shape[0], synth_set.shape[0], epochs, rep_total)
return results_list
def run_experiments(config):
dataset_functions = {
'housing': housing,
'adult': adult
}
dataset = dataset_functions[config.get('dataset', 'adult')]()
dataset_name = config.get('dataset')
dataset = dataset_functions[dataset_name]()
ndata = dataset.shape[0]
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(pd.DataFrame(dataset))
mem_set_size_list = config.get('mem_set_size_list', [20])
reference_set_size_list = config.get('reference_set_size_list', [100])
synthetic_sizes = config.get('synthetic_sizes', [100])
epochs_list = config.get('epochs_list', [300])
rep_total = config.get('rep_total', 10)
results_list = []
experiment_len = len(mem_set_size_list) * len(reference_set_size_list) * len(synthetic_sizes) * len(epochs_list) * rep_total
i = 0
for reference_set_size in reference_set_size_list:
for mem_set_size in mem_set_size_list:
if mem_set_size * 2 + reference_set_size >= ndata:
# mem_set_size or ref_set_size too large
continue
for epochs in epochs_list:
results = {}
for rep in range(rep_total):
np.random.shuffle(dataset)
mem_set = dataset[:mem_set_size]
non_mem_set = dataset[mem_set_size : 2 * mem_set_size]
reference_set = dataset[-reference_set_size:]
df = pd.DataFrame(mem_set)
df.columns = [str(_) for _ in range(dataset.shape[1])]
generator = TVAESynthesizer(metadata=metadata)
generator.fit(df)
for synthetic_size in synthetic_sizes:
i += 1
# Create synthetic sets
synth_set = generator.sample(synthetic_size)
X_test = np.concatenate([mem_set, non_mem_set])
Y_test = np.concatenate(
[np.ones(mem_set.shape[0]), np.zeros(non_mem_set.shape[0])]
).astype(bool)
score = {}
score["LOGAN_D1"] = LOGAN_D1(X_test, synth_set.values, reference_set, epochs)
# score["MC"] = MC(X_test, synth_set.values)
# score["gan_leaks"] = GAN_leaks(X_test, synth_set.values)
# score["metric_enhanced_PC"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='PC')
score["metric_enhanced_AVG"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='AVG')
score["metric_enhanced_KLD"] = metric_enhanced(X_test, synth_set.values, reference_set, mem_set, epochs, enhancement_type='KLD')
for name, y_scores in score.items():
acc, auc = compute_metrics_baseline(y_scores, Y_test, sample_weight=None)
results[name] = {
"accuracy": acc,
"aucroc": auc,
}
sys.stdout.write('\r')
j = i / experiment_len
sys.stdout.write("[%-20s] %d%%" % ('='*int(20*j), 100*j))
sys.stdout.write(f' - reference_set_size, mem_set_size, synthetic_size, epochs: {reference_set_size}, {mem_set_size}, {synthetic_size}, {epochs}')
sys.stdout.flush()
for name, metrics in results.items():
results_list.append({
"method": name,
"reference_set_size": reference_set_size,
"mem_set_size": mem_set_size,
"synthetic_size": synthetic_size,
"epochs": epochs,
"accuracy": metrics["accuracy"],
"aucroc": metrics["aucroc"]
})
mem_set_size_list = config.get('mem_set_size_list')
reference_set_size_list = config.get('reference_set_size_list')
synthetic_sizes = config.get('synthetic_sizes')
epochs_list = config.get('epochs_list')
rep_total = config.get('rep_total')
gen_model = config.get('model')
global_results = []
# Worker count: the SLURM CPU allocation if available, otherwise 25% of the local cores.
n_workers = max(int(os.environ.get('SLURM_CPUS_PER_TASK', 0)), int(mp.cpu_count() * 0.25))
job_pool = mp.Pool(n_workers)
print(n_workers)
for mem_set_size in mem_set_size_list:
np.random.shuffle(dataset)
mem_set = dataset[:mem_set_size]
df = pd.DataFrame(mem_set)
df.columns = [str(_) for _ in range(dataset.shape[1])]
if gen_model == 'tvae':
generator = TVAESynthesizer(metadata=metadata)
else:
generator = CTGANSynthesizer(metadata=metadata)
generator.fit(df)
for synthetic_size in synthetic_sizes:
# Create synthetic sets
synth_set = generator.sample(synthetic_size)
for reference_set_size in reference_set_size_list:
if mem_set_size * 2 + reference_set_size >= ndata:
# mem_set_size or ref_set_size too large
continue
non_mem_set = dataset[mem_set_size: 2 * mem_set_size]
reference_set = dataset[-reference_set_size:]
results_list = []
async_result = []
for epochs in epochs_list:
async_result.append(job_pool.apply_async(run_async, [reference_set, non_mem_set, mem_set, synth_set, epochs, rep_total, results_list, gen_model, dataset_name]))
for result in async_result:
results_list += result.get()
tmp_df = pd.DataFrame(results_list)
tmp_df.to_csv(os.path.join('results', "%s_%s_m%d_s%d_r%d.csv" % (dataset_name, gen_model, mem_set_size, synthetic_size, reference_set_size)), sep=';', index=False)
global_results.append(results_list)
sys.stdout.write('\n')
return results_list
return global_results
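For quick local smoke tests, `run_experiments` can also be driven with an in-memory dict instead of one of the YAML files below. The following is a minimal sketch, not repository code: `demo_config` and its tiny values are illustrative only, and the `results/` directory is created up front because `run_experiments` writes its per-combination CSVs into it.

```python
# Minimal sketch: calling run_experiments directly with a small, illustrative config.
# Keys mirror the YAML configs below; the values here are deliberately tiny.
import os

if __name__ == "__main__":
    os.makedirs("results", exist_ok=True)  # run_experiments writes CSVs into results/
    demo_config = {
        "dataset": "adult",            # or "housing"
        "model": "tvae",               # anything else selects CTGANSynthesizer
        "mem_set_size_list": [20],
        "reference_set_size_list": [100],
        "synthetic_sizes": [100],
        "epochs_list": [100],
        "rep_total": 2,
    }
    # main.py additionally calls mp.set_start_method('spawn') before this point.
    all_results = run_experiments(demo_config)
```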
dataset: adult # or housing
mem_set_size_list: [20]
reference_set_size_list: [100]
synthetic_sizes: [100]
epochs_list: [300]
rep_total: 10
dataset: [housing, adult]
mem_set_size_list: [10,20,30,40,50,60,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
reference_set_size_list: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
synthetic_sizes: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
epochs_list: [100,200,300,400]
rep_total: 50
model: [tvae, gan]
\ No newline at end of file
dataset: adult
mem_set_size_list: [10,20,30,40,50,60,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
reference_set_size_list: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
synthetic_sizes: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
epochs_list: [100,200,300,400]
rep_total: 50
model: gan
dataset: adult
mem_set_size_list: [10,20,30,40,50,60,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
reference_set_size_list: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
synthetic_sizes: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
epochs_list: [100,200,300,400]
rep_total: 50
model: tvae
dataset: housing
mem_set_size_list: [10,20,30,40,50,60,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
reference_set_size_list: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
synthetic_sizes: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
epochs_list: [100,200,300,400]
rep_total: 50
model: gan
dataset: housing
mem_set_size_list: [10,20,30,40,50,60,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
reference_set_size_list: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
synthetic_sizes: [10,20,30,40,50,60,70,80,90,100,300,500,700,900,1100]
epochs_list: [100,200,300,400]
rep_total: 50
model: tvae
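These YAML files (presumably the per dataset/model `config_*_*.yaml` variants invoked by the job script further below) are plain key/value configs. A minimal sketch of how such a file could be parsed into the dict that `run_experiments` expects, assuming PyYAML is the parser used by `main.py`; the `load_config` helper name is hypothetical and not part of the repository code:

```python
# Hypothetical helper: parse one of the YAML experiment configs into a plain dict.
# Assumes PyYAML is installed.
import yaml

def load_config(path: str) -> dict:
    with open(path, "r") as fh:
        return yaml.safe_load(fh)

# e.g. cfg = load_config("config_adult_tvae.yaml")
# cfg["model"] -> "tvae", cfg["epochs_list"] -> [100, 200, 300, 400]
```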
@@ -4,6 +4,9 @@ import sys
from datetime import datetime
import pandas as pd
import multiprocessing as mp
from attack import run_experiments
@@ -26,6 +29,8 @@ def main(config_file):
if __name__ == "__main__":
mp.set_start_method('spawn')
if len(sys.argv) != 2:
print("Usage: python main.py <config_file>")
sys.exit(1)
......
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from statsmodels.api import OLS
import matplotlib
from sklearn.tree import DecisionTreeRegressor
# os.chdir('test_data')
#%%
file_list = os.listdir()
all_data = pd.DataFrame()
for f in file_list:
if f.endswith('.csv') and 'tmp_all_data' not in f:
props = f[:-4].split('_')
tmp_df = pd.read_csv(f, sep=';')
tmp_df['experiment'] = f
# tmp_df = tmp_df[np.all(tmp_df[[ 'mem_set_size', 'reference_set_size', 'synthetic_size', 'epochs']] - [int(p[1:]) for p in props] == 0, axis=1)]
all_data = pd.concat([all_data,tmp_df], axis=0, ignore_index=True)
all_data.to_csv('tmp_all_data.csv', sep=';', index=False)
#%%
# all_data.drop(columns=['method']).corr()
#%%
# all_data = pd.read_csv('tmp_all_data.csv', sep=';', low_memory=True)
all_data.set_index('experiment', inplace=True)
new_df = pd.concat([all_data.drop(columns=['method']),pd.get_dummies(all_data['method']).astype(int)], axis=1)
desc = new_df.describe()
#%%
new_df_w_intercept = new_df.copy()
new_df_w_intercept['intercept'] = 1
lm1 = OLS(new_df_w_intercept['accuracy'], new_df_w_intercept.drop(columns=['aucroc', 'accuracy']))
result1 = lm1.fit()
print(result1.summary())
lm2 = OLS(new_df_w_intercept['aucroc'], new_df_w_intercept.drop(columns=['aucroc', 'accuracy']))
result2 = lm2.fit()
print(result2.summary())
#%%
dtr = DecisionTreeRegressor()
dtr.fit(new_df.drop(columns=['aucroc', 'accuracy']), new_df['accuracy'],)
print(dict(zip(new_df.drop(columns=['aucroc', 'accuracy']),dtr.feature_importances_)))
#%%
def plot_with_varying_mem_set_size(this_data):
plt.figure()
plt.tight_layout()
print(this_data['synthetic_size'])
cmap = matplotlib.cm.get_cmap('Dark2')
meths_list = this_data['method'].unique().tolist()
legend_list = []
for m in range(len(meths_list)):
col = cmap(m)
ax = plt.gca()
grouped = this_data[this_data['method'] == meths_list[m]].groupby('synthetic_size')['accuracy']
y_avg = grouped.mean()
y_min = grouped.min()
y_max = grouped.max()
x = y_avg.index  # use the sorted group keys so x stays aligned with the aggregated values
plt.plot(x, y_avg, color=col)
ax.fill_between(x, y_min, y_max, color=col, alpha=.3)
legend_list.append(meths_list[m])
legend_list.append(meths_list[m] + ' (Min/Max)')
plt.legend(legend_list)
r = this_data['reference_set_size'].iloc[0]
s = this_data['mem_set_size'].iloc[0]
e = this_data['epochs'].iloc[0]
plt.savefig("fig_mem_set_size_" + str(r) + "_" + str(s) + "_" + str(e) + ".png")
all_data.groupby(['reference_set_size', 'mem_set_size', 'epochs'], as_index=False).apply(lambda b: plot_with_varying_mem_set_size(b))
File moved
File moved
#!/bin/bash
#SBATCH --job-name=metric_mia
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --nodes=2
#SBATCH --ntasks=4
#SBATCH --cpus-per-task=20
#SBATCH --mem-per-cpu=500
#SBATCH --time=7-00:00:00
#SBATCH --output=job_%j.out
#SBATCH --error=job_%j.err
#SBATCH --qos=standard
# Reporting
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=dar@zedat.fu-berlin.de
# store job info in output file, if you want...
scontrol show job $SLURM_JOBID
rm results/*
echo 'Activating virtual environment'
source mia_3.11/bin/activate
echo 'Starting Python scripts'
srun --exact -n1 -G1 python3.11 main.py config_adult_gan.yaml &
srun --exact -n1 -G1 python3.11 main.py config_housing_gan.yaml &
srun --exact -n1 -G1 python3.11 main.py config_adult_tvae.yaml &
srun --exact -n1 -G1 python3.11 main.py config_housing_tvae.yaml &
wait