import os
from af.controller.data.DataFactory import DataFactory
from af.controller.data.SqliteController import SqliteController
from af.model.hierarchies.BaseHierarchy import BaseHierarchy
from af.utils import (
ANONYMIZATION_DIRECTORY,
ANONYMIZATION_DB_NAME,
ADDITIONAL_INFO_TABLE,
PRIVACY_TYPE_IDENTIFIER,
)
[docs]class PreProcessingStage(object):
"""Class used by the BaseAlgorithm to prepare the complete scenario before anonymizing.
"""
def __init__(self, data_config):
self.data_config = data_config
self.initial_db_location = self.data_config.location
self.db_location = os.path.join(ANONYMIZATION_DIRECTORY, ANONYMIZATION_DB_NAME)
self.table = self.data_config.table
self.db_controller = None
[docs] def preprocess(self):
"""Method that calls all the necessary steps before a data transformation
"""
self.clean_previous_work()
self.create_db_copy()
self.db_controller = SqliteController(self.db_location)
self.remove_identifiable_attributes()
self.set_indexes_over_qi()
self.create_additional_information_table()
[docs] def clean_previous_work(self):
"""If an anonymized db from a previous session exists, then delete it.
db_location contains the path where the new db will be created
"""
if os.path.isfile(self.db_location):
os.remove(self.db_location)
[docs] def create_db_copy(self):
"""Takes the original DB and creates a new copy ready to be manipulated and modified.
It preserves the state of the db with the raw data.
initial_db_location: contains the original path to the raw db.
controller: given the original db extension, it creates an instance of a db controller capable of querying the db.
"""
extension = os.path.basename(self.initial_db_location).split('.')[1]
controller = DataFactory.get_controller_from_extension(extension)
controller.create_db_copy(self.initial_db_location, self.db_location)
[docs] def remove_identifiable_attributes(self):
"""Given a list of identifiable attributes, it suppreses them, as they are forbidden to appear in any form once the data is anonymized.
identifiable_list: contains all those attributes from the data config that were selected as Identifiable.
supression_value: default string based on the supression node ('**********')
query: simple update query, to set values to supression_value for all identifiable attributes.
"""
identifiable_list = [att.name for att in self.data_config.get_privacy_type_attributes_list(PRIVACY_TYPE_IDENTIFIER)]
supression_value = BaseHierarchy.supression_node()
update_ident_list = ["{0}='{1}'".format(att, supression_value) for att in identifiable_list]
query = "UPDATE {0} SET {1};".format(self.table, ', '.join(update_ident_list))
list(self.db_controller.execute_query(query))
[docs] def set_indexes_over_qi(self):
"""In order to make queries more efficients, we set indexes over each attribute selected as Quasi-identifable,
and a composite index over all of them.
"""
qi_list = []
# Individual indexes
for att in self.data_config.get_privacy_type_attributes_list():
query = "CREATE INDEX {0}_index ON {1} ({2});".format(att.name,
self.table,
att.name)
list(self.db_controller.execute_query(query))
qi_list.append(att.name)
# Composite index
query = "CREATE INDEX qi_index ON {0} ({1});".format(self.table, ', '.join(qi_list))
list(self.db_controller.execute_query(query))