--load the synthetic dataset
create table synthetic_dataset (
employee_first_name varchar2(20),
employee_last_name varchar2(30),
ssn varchar2(9),
date_of_birth date
);
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('asodké-qf31ñ','örsenFeld jr','123456789','23-Oct-1912');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('Mr.Rùteyndô','Plånçk-Vladímírä','612344564','22-Jul-1978');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('kãmiila','ĆlasS Sr.','367863592','22-May-1981');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('DR. PABLÜ','VINCCI','150007103','16-Mar-1999');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('mrs. Końšttantin','Amatores','121212121','12-Dec-2018');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('Káillynoa','Christensen','666997221','5-Jun-1903');
insert into synthetic_dataset
(employee_last_name, ssn, date_of_birth)
values
('Van Nifterik','524755718','23-Apr-1908');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('Gödöfredö','Stjepanić','078051120','01-Feb-2022');
insert into synthetic_dataset
(employee_first_name, employee_last_name, ssn, date_of_birth)
values
('Akjhil','al-Prinsen Kiruch','818681414','18-Jun-1948');
insert into synthetic_dataset
(employee_first_name, employee_last_name, date_of_birth)
values
('DINN3SH - Q','Lévêque Soüçe','21-Sep-1987');
/*translate the hashing code
--4. CLEAN RAW DATA
--Strip spaces and symbols from names
--Ensure that DOBS are valid dates
--Ensure that SSNs are strings*/
update
synthetic_dataset
set
--Clean names
--Convert to lowercase and strip leading and trailing whitespace
employee_first_name=trim(lower(employee_first_name)),
employee_last_name=trim(lower(employee_last_name));
create table clean_data
as select regexp_replace(employee_first_name, 's/^(mr\ |mrs\ |dr\ |ms\ |mr\. |mrs\. |dr\. |ms\.)+//','',1) as first_name
from synthetic_dataset;
select *
from synthetic_dataset;
select *
from clean_data;