Dear SCN community,
We are building a predictive model in HANA PAL using the SVM algorithm. The sample code was taken from the HANA PAL Guide and adjusted to our dataset: we changed the schema name and imported the data manually into the input data table. Our training dataset (TRAIN) contains 798 instances and the test dataset (TEST) contains 202 instances. There are 19 independent variables and 1 dependent variable.
The model was created successfully, but the predictions made on the test data are wrong: the model predicts a value of 0 for all 202 instances. We created the same model in R and it worked fine, with a prediction accuracy of about 78%. Our questions are therefore:
- Are the parameter settings suitable for our data? Please find the SQL code below.
- Should the target variable ‘class2’ be imported as an INTEGER or a DOUBLE? The target variable has only values 0 and 1.
- How can we find out the probability of the target variable being 0 or 1? This is explained for other algorithms but not for SVM.
I have attached the training and testing data that I am using. I would be very grateful for any help!
Thank you for your time and help!
SQL CODE –
-- Make DE001_D14_710 the current schema so the unqualified object names below resolve to it.
SET SCHEMA DE001_D14_710;
--prepare input training data table type--
-- Column layout: row ID, the target variable class2, then the 18 attribute columns.
--DROP TYPE PAL_SVM_TRAININGSET_T;
CREATE TYPE PAL_SVM_TRAININGSET_T AS TABLE (
    ID                     integer,
    class2                 double,
    credit_usage           integer,
    own_telephone          varchar(100),
    existing_credits       varchar(100),
    other_payment_plans    varchar(100),
    property_magnitude     varchar(100),
    residence_since        integer,
    personal_status        varchar(100),
    other_parties          varchar(100),
    purpose                varchar(100),
    credit_history         varchar(100),
    over_draft             varchar(100),
    current_balance        integer,
    Average_Credit_Balance varchar(100),
    employment             varchar(100),
    cc_age                 integer,
    housing                varchar(100),
    job                    varchar(100),
    num_dependents         integer
);
--prepare argument table type--
-- One row per parameter: its name plus an integer, a double and a string value slot.
-- NOTE(review): the INT#/DOUBLE#/STRING# column names look like a search-and-replace
-- artifact (the PAL guide sample presumably uses INTARGS/DOUBLEARGS/STRINGARGS).
-- They are left unchanged here so they stay in sync with #PAL_CONTROL_TBL.
--DROP TYPE PAL_CONTROL_T;
CREATE TYPE PAL_CONTROL_T AS TABLE (
    NAME                    varchar(50),
    INT#PAL_CONTROL_TBL     integer,
    DOUBLE#PAL_CONTROL_TBL  double,
    STRING#PAL_CONTROL_TBL  varchar(100)
);
--prepare result table type (model part 1)--
--DROP TYPE PAL_SVM_MODELPART1_T;
CREATE TYPE PAL_SVM_MODELPART1_T AS TABLE (
    ID     varchar(50),
    VALUEE double
);
--prepare result table type (model part 2)--
-- The second column is named ALPHA (not class2) so that this type agrees with
-- PAL_SVM_MODELPART2_TBL and with the model-part-2 type used for prediction.
-- The data types are unchanged.
--DROP TYPE PAL_SVM_MODELPART2_T;
CREATE TYPE PAL_SVM_MODELPART2_T AS TABLE (
    ID                     integer,
    ALPHA                  double,
    credit_usage           integer,
    own_telephone          varchar(100),
    existing_credits       varchar(100),
    other_payment_plans    varchar(100),
    property_magnitude     varchar(100),
    residence_since        integer,
    personal_status        varchar(100),
    other_parties          varchar(100),
    purpose                varchar(100),
    credit_history         varchar(100),
    over_draft             varchar(100),
    current_balance        integer,
    Average_Credit_Balance varchar(100),
    employment             varchar(100),
    cc_age                 integer,
    housing                varchar(100),
    job                    varchar(100),
    num_dependents         integer
);
--prepare result table type (model part 3)--
--DROP TYPE PAL_SVM_MODELPART3_T;
CREATE TYPE PAL_SVM_MODELPART3_T AS TABLE (
    ID          integer,
    MAPSTRING   varchar(100),
    MAPPOSITION integer
);
----create PAL procedure for training----
-- PAL_SVM_PDATA_TBL is the signature table handed to the wrapper generator:
-- one row per procedure parameter (position, schema, table type, direction).
--DROP TABLE PAL_SVM_PDATA_TBL;
CREATE TABLE PAL_SVM_PDATA_TBL (
    "POSITION"       INT,
    "SCHEMA_NAME"    NVARCHAR(256),
    "TYPE_NAME"      NVARCHAR(256),
    "PARAMETER_TYPE" VARCHAR(7)
);
-- Two inputs (training data, control parameters) and three model outputs.
INSERT INTO PAL_SVM_PDATA_TBL VALUES (1, 'DE001_D14_710', 'PAL_SVM_TRAININGSET_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (2, 'DE001_D14_710', 'PAL_CONTROL_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (3, 'DE001_D14_710', 'PAL_SVM_MODELPART1_T', 'OUT');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (4, 'DE001_D14_710', 'PAL_SVM_MODELPART2_T', 'OUT');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (5, 'DE001_D14_710', 'PAL_SVM_MODELPART3_T', 'OUT');
--call SYS.AFLLANG_WRAPPER_PROCEDURE_DROP('DE001_D14_710','PAL_SVM_TRAIN');
CALL SYS.AFLLANG_WRAPPER_PROCEDURE_CREATE('AFLPAL', 'SVMTRAIN', 'DE001_D14_710', 'PAL_SVM_TRAIN', PAL_SVM_PDATA_TBL);
--create input training data table--
--DROP TABLE PAL_SVM_TRAININGSET_TBL;
-- The statement needs its terminating semicolon; without it the parser runs on
-- into the next CREATE statement.
CREATE COLUMN TABLE PAL_SVM_TRAININGSET_TBL LIKE PAL_SVM_TRAININGSET_T;
----- We manually import the training data in the above created empty column table using the interface (TRAIN.CSV)
--create training argument table--
-- Session-local temporary table with the same column layout as PAL_CONTROL_T.
--DROP TABLE #PAL_CONTROL_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_CONTROL_TBL (
    NAME                    varchar(50),
    INT#PAL_CONTROL_TBL     integer,
    DOUBLE#PAL_CONTROL_TBL  double,
    STRING#PAL_CONTROL_TBL  varchar(100)
);
--create model part 1 table--
--DROP TABLE PAL_SVM_MODELPART1_TBL;
CREATE COLUMN TABLE PAL_SVM_MODELPART1_TBL (
    ID     varchar(50),
    VALUEE double
);
--create model part 2 table--
-- ID, ALPHA, then the same 18 attribute columns as the training data.
--DROP TABLE PAL_SVM_MODELPART2_TBL;
CREATE COLUMN TABLE PAL_SVM_MODELPART2_TBL (
    ID                     integer,
    ALPHA                  double,
    credit_usage           integer,
    own_telephone          varchar(100),
    existing_credits       varchar(100),
    other_payment_plans    varchar(100),
    property_magnitude     varchar(100),
    residence_since        integer,
    personal_status        varchar(100),
    other_parties          varchar(100),
    purpose                varchar(100),
    credit_history         varchar(100),
    over_draft             varchar(100),
    current_balance        integer,
    Average_Credit_Balance varchar(100),
    employment             varchar(100),
    cc_age                 integer,
    housing                varchar(100),
    job                    varchar(100),
    num_dependents         integer
);
--create model part 3 table--
--DROP TABLE PAL_SVM_MODELPART3_TBL;
CREATE COLUMN TABLE PAL_SVM_MODELPART3_TBL (
    ID          integer,
    MAPSTRING   varchar(100),
    MAPPOSITION integer
);
---insert data into input training argument---
-- Each row is (parameter name, integer value, double value, string value);
-- only the integer slot is used here.
-- NOTE(review): KERNEL_TYPE = 2 and TYPE = 1 presumably select the RBF kernel and
-- classification; verify the codes against the PAL guide for your HANA revision.
-- NOTE(review): NR_FOLD is supplied although CROSS_VALIDATION is 0; presumably it is
-- ignored in that case. Confirm.
INSERT INTO #PAL_CONTROL_TBL VALUES ('THREAD_NUMBER', 8, null, null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('KERNEL_TYPE', 2, null, null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('TYPE', 1, null, null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('CROSS_VALIDATION', 0, null, null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('NR_FOLD', 5, null, null);
-- Train: reads the training and control tables, fills the three model tables.
CALL DE001_D14_710.PAL_SVM_TRAIN(
    PAL_SVM_TRAININGSET_TBL,
    #PAL_CONTROL_TBL,
    PAL_SVM_MODELPART1_TBL,
    PAL_SVM_MODELPART2_TBL,
    PAL_SVM_MODELPART3_TBL
) WITH OVERVIEW;
-- Inspect the training inputs, then the three model tables passed to PAL_SVM_TRAIN.
SELECT *
  FROM PAL_SVM_TRAININGSET_TBL;

SELECT *
  FROM #PAL_CONTROL_TBL;

SELECT *
  FROM PAL_SVM_MODELPART1_TBL;

SELECT *
  FROM PAL_SVM_MODELPART2_TBL;

SELECT *
  FROM PAL_SVM_MODELPART3_TBL;
--prepare input predicting test data table type--
-- Same layout as the training type minus the target column class2:
-- row ID followed by the 18 attribute columns in the same order.
--DROP TYPE PAL_SVM_TESTINGSET_T;
CREATE TYPE PAL_SVM_TESTINGSET_T AS TABLE (
    ID                     integer,
    credit_usage           integer,
    own_telephone          varchar(100),
    existing_credits       varchar(100),
    other_payment_plans    varchar(100),
    property_magnitude     varchar(100),
    residence_since        integer,
    personal_status        varchar(100),
    other_parties          varchar(100),
    purpose                varchar(100),
    credit_history         varchar(100),
    over_draft             varchar(100),
    current_balance        integer,
    Average_Credit_Balance varchar(100),
    employment             varchar(100),
    cc_age                 integer,
    housing                varchar(100),
    job                    varchar(100),
    num_dependents         integer
);
--prepare argument table type--
-- NOTE(review): PAL_CONTROL_T is already created in the training section of this
-- script with the same definition. With the DROP below commented out, this CREATE
-- is expected to fail with a duplicate-name error when the whole script runs in one
-- go. Skip this statement, or enable the DROP after confirming that dropping the
-- type does not break PAL_SVM_TRAIN, which was registered with it.
--DROP TYPE PAL_CONTROL_T;
CREATE TYPE PAL_CONTROL_T AS TABLE (
    NAME                    varchar(50),
    INT#PAL_CONTROL_TBL     integer,
    DOUBLE#PAL_CONTROL_TBL  double,
    STRING#PAL_CONTROL_TBL  varchar(100)
);
--prepare model part 1 table type--
-- NOTE(review): PAL_SVM_MODELPART1_T is already created in the training section of
-- this script with the same definition. With the DROP below commented out, this
-- CREATE is expected to fail with a duplicate-name error when the whole script runs
-- in one go. Skip it, or enable the DROP after confirming PAL_SVM_TRAIN is unaffected.
--DROP TYPE PAL_SVM_MODELPART1_T;
CREATE TYPE PAL_SVM_MODELPART1_T AS TABLE (
    ID     varchar(50),
    VALUEE double
);
--prepare model part 2 table type--
-- NOTE(review): PAL_SVM_MODELPART2_T is also created in the training section of this
-- script (same column types). With the DROP below commented out, this CREATE is
-- expected to fail with a duplicate-name error when the whole script runs in one go.
-- Skip it, or enable the DROP after confirming PAL_SVM_TRAIN is unaffected.
--DROP TYPE PAL_SVM_MODELPART2_T;
CREATE TYPE PAL_SVM_MODELPART2_T AS TABLE (
    ID                     integer,
    ALPHA                  double,
    credit_usage           integer,
    own_telephone          varchar(100),
    existing_credits       varchar(100),
    other_payment_plans    varchar(100),
    property_magnitude     varchar(100),
    residence_since        integer,
    personal_status        varchar(100),
    other_parties          varchar(100),
    purpose                varchar(100),
    credit_history         varchar(100),
    over_draft             varchar(100),
    current_balance        integer,
    Average_Credit_Balance varchar(100),
    employment             varchar(100),
    cc_age                 integer,
    housing                varchar(100),
    job                    varchar(100),
    num_dependents         integer
);
--prepare model part 3 table type--
-- NOTE(review): PAL_SVM_MODELPART3_T is already created in the training section of
-- this script with the same definition. With the DROP below commented out, this
-- CREATE is expected to fail with a duplicate-name error when the whole script runs
-- in one go. Skip it, or enable the DROP after confirming PAL_SVM_TRAIN is unaffected.
--DROP TYPE PAL_SVM_MODELPART3_T;
CREATE TYPE PAL_SVM_MODELPART3_T AS TABLE (
    ID          integer,
    MAPSTRING   varchar(100),
    MAPPOSITION integer
);
--prepare predicting result table type--
-- One output row per test row: its ID and the predicted value.
--DROP TYPE PAL_SVM_PREDICTION_T;
CREATE TYPE PAL_SVM_PREDICTION_T AS TABLE (
    ID      integer,
    PREDICT double
);
----create PAL procedure for predicting----
-- The signature table is created earlier in this script for PAL_SVM_TRAIN and still
-- holds those five rows. It is dropped here so the prediction signature starts
-- empty; otherwise the CREATE below fails and the INSERTs would append to the
-- training signature. Comment the DROP out only if the table does not exist yet.
DROP TABLE PAL_SVM_PDATA_TBL;
CREATE COLUMN TABLE PAL_SVM_PDATA_TBL (
    "POSITION"       INT,
    "SCHEMA_NAME"    NVARCHAR(256),
    "TYPE_NAME"      NVARCHAR(256),
    "PARAMETER_TYPE" VARCHAR(7)
);
-- Five inputs (test data, control parameters, three model parts) and one output.
INSERT INTO PAL_SVM_PDATA_TBL VALUES (1, 'DE001_D14_710', 'PAL_SVM_TESTINGSET_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (2, 'DE001_D14_710', 'PAL_CONTROL_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (3, 'DE001_D14_710', 'PAL_SVM_MODELPART1_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (4, 'DE001_D14_710', 'PAL_SVM_MODELPART2_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (5, 'DE001_D14_710', 'PAL_SVM_MODELPART3_T', 'IN');
INSERT INTO PAL_SVM_PDATA_TBL VALUES (6, 'DE001_D14_710', 'PAL_SVM_PREDICTION_T', 'OUT');
--call SYS.AFLLANG_WRAPPER_PROCEDURE_DROP('DE001_D14_710','PAL_SVM_PREDICT');
CALL SYS.AFLLANG_WRAPPER_PROCEDURE_CREATE('AFLPAL', 'SVMPREDICT', 'DE001_D14_710', 'PAL_SVM_PREDICT', PAL_SVM_PDATA_TBL);
--create input predicting test data table--
--DROP TABLE PAL_SVM_TESTINGSET_TBL;
-- The statement needs its terminating semicolon; without it the parser runs on
-- into the next CREATE statement.
CREATE COLUMN TABLE PAL_SVM_TESTINGSET_TBL LIKE PAL_SVM_TESTINGSET_T;
----- We manually import the testing data in the above created empty column table using the interface (TEST.CSV)
--create predicting argument table--
-- NOTE(review): #PAL_CONTROL_TBL is also created in the training section. If both
-- sections run in the same session, this CREATE is expected to fail and the table
-- will still hold the training parameters; enable the DROP below in that case.
--DROP TABLE #PAL_CONTROL_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_CONTROL_TBL (
    NAME                    varchar(50),
    INT#PAL_CONTROL_TBL     integer,
    DOUBLE#PAL_CONTROL_TBL  double,
    STRING#PAL_CONTROL_TBL  varchar(100)
);
--create predicting result table--
--DROP TABLE PAL_SVM_PREDICTION_TBL;
CREATE COLUMN TABLE PAL_SVM_PREDICTION_TBL (
    ID      integer,
    PREDICT double
);
---insert data into input predicting argument---
-- Only the thread count is set for prediction.
INSERT INTO #PAL_CONTROL_TBL VALUES ('THREAD_NUMBER', 8, null, null);
-- Score the test rows with the three model tables; results go to PAL_SVM_PREDICTION_TBL.
CALL DE001_D14_710.PAL_SVM_PREDICT(
    PAL_SVM_TESTINGSET_TBL,
    #PAL_CONTROL_TBL,
    PAL_SVM_MODELPART1_TBL,
    PAL_SVM_MODELPART2_TBL,
    PAL_SVM_MODELPART3_TBL,
    PAL_SVM_PREDICTION_TBL
) WITH OVERVIEW;
SELECT * FROM PAL_SVM_PREDICTION_TBL;