-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scripts_for_CreditCardFraud.txt
147 lines (104 loc) · 7.78 KB
/
Scripts_for_CreditCardFraud.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
PRE-REQUISITES.
1. AWS Account
2. AWS EC2 machine RedHat linux OS. Installed with Hadoop, Hive, HDFS, Hbase, Spark, Spark Streaming
TASK1 : Log on to EC2 machine as ec2-user
Run the following commmands from /home/ec2-user
mkdir creditcardproject
cd creditcardproject
mkdir inpdata
mkdir src
cd src
mkdir db
mkdir rules
cd /home/ec2-user
TASK 2 : Copy the files from local to the EC2 machine
WinSCP the card_transactions.csv file into creditcardproject/inpdata/.
WinSCP the driver.py file into creditcardproject/src/
WinSCP the rules.py file into creditcardproject/src/db/
WinSCP the geo_map.py file into creditcardproject/src/db/
WinSCP the dao.py file into creditcardproject/src/db/
TASK 3 : HADOOP commands for directory creation.
hadoop fs -mkdir /user/ec2-user/CCFraudDetection
hadoop fs -mkdir /user/ec2-user/CCFraudDetection/input
TASK 4 : Loading Historical transaction data from EC2 machine into HDFS
hadoop fs -put /home/ec2-user/creditcardproject/inpdata/card_transactions.csv /user/ec2-user/CCFraudDetection/input/
TASK 5 : Create tables in HBase using HBase shell. Only required if you are creating external tables using Hive-Hbase Integration
hbase shell
create 'card_transactions_master', 'cardDetail', 'transactionDetail'
create 'card_transaction_lookup', 'cf'
TASK 6 : Create tables in Hive
#open hive shell
hive
create external table if not exists card_transactions_history_data (card_id BIGINT, member_id BIGINT, amount DOUBLE, postcode INT, pos_id BIGINT, transaction_dt STRING, status STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/ec2-user/CCFraudDetection/input/hive/' tblproperties("skip.header.line.count"="1");
load data inpath '/user/ec2-user/CCFraudDetection/input/card_transactions.csv' overwrite into table card_transactions_history_data;
select * from card_transactions_history_data LIMIT 10;
select * from card_transactions_hbase_master LIMIT 10;
select count(*) from card_transactions_history_data;
# Create table in Hbase through Hive. Only possible if not using external table. No need to create hbase table using hbase shell
create table if not exists card_transactions_hbase_master (rowid STRING, card_id BIGINT, member_id BIGINT, amount DOUBLE, postcode INT, pos_id BIGINT, transaction_dt STRING, status STRING) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key, cardDetail:card_id, cardDetail:member_id, transactionDetail:amount, transactionDetail:postcode, transactionDetail:pos_id, transactionDetail:transaction_dt, transactionDetail:status") tblproperties("hbase.table.name"="card_transactions_master");
insert overwrite table card_transactions_hbase_master select regexp_replace(reflect('java.util.UUID','randomUUID'), '-', '') as rowid, card_id, member_id, amount, postcode, pos_id, transaction_dt, status from card_transactions_history_data;
TASK 7 : Ingest the relevant data from AWS into HDFS using Sqoop. Create Hive tables from HDFS data
hadoop fs -rm -r /user/ec2-user/CCFraudDetection/input/awsrdstables/card_member
hadoop fs -rm -r /user/ec2-user/CCFraudDetection/input/awsrdstables/member_score
hadoop fs -ls /user/ec2-user/CCFraudDetection/input/awsrdstables/card_member
hadoop fs -ls /user/ec2-user/CCFraudDetection/input/awsrdstables/member_score
sqoop import \
--connect jdbc:mysql://awsrds1.cyaielc9bmnf.us-east-1.rds.amazonaws.com/cred_financials_data \
--username <<USERNAME>> \
--password <<PASSWORD>> \
--table card_member \
--target-dir /user/ec2-user/CCFraudDetection/input/awsrdstables/card_member
sqoop import \
--connect jdbc:mysql://awsrds1.cyaielc9bmnf.us-east-1.rds.amazonaws.com/cred_financials_data \
--table member_score \
--username <<USERNAME>> \
--password <<PASSWORD>> \
--target-dir /user/ec2-user/CCFraudDetection/input/awsrdstables/member_score
hive
create external table if not exists card_member (card_id BIGINT, member_id BIGINT, member_joining_dt STRING, card_purchase_dt STRING, country STRING, city STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/ec2-user/CCFraudDetection/input/hive/card_member/';
create external table if not exists member_score (member_id BIGINT, score INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/ec2-user/CCFraudDetection/input/hive/member_score/';
load data inpath '/user/ec2-user/CCFraudDetection/input/awsrdstables/card_member/part*' overwrite into table card_member;
load data inpath '/user/ec2-user/CCFraudDetection/input/awsrdstables/member_score/part*' overwrite into table member_score;
TASK 8 : CREATE AND POPULATE LOOKUP TABLE
==========================================
create table if not exists card_transactions_lookup
(card_id BIGINT, ucl DOUBLE, postcode INT, transaction_dt STRING, score INT)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,
cf:ucl, cf:postcode, cf:transaction_dt, cf:score")
tblproperties("hbase.table.name"="card_transaction_lookup");
TASK 8.1 CREATE STAGING TABLES in Hive
========================================
create external table if not exists card_score
(card_id BIGINT, score INT)
STORED as ORC
LOCATION '/user/ec2-user/CCFraudDetection/input/hive/card_score/'
tblproperties ("orc.compress"="SNAPPY");
create external table if not exists card_last_ten_transactions
(card_id BIGINT, amount DOUBLE, postcode INT, transaction_dt STRING, status STRING)
STORED as ORC
LOCATION '/user/ec2-user/CCFraudDetection/input/hive/card_last_ten_transactions/'
tblproperties ("orc.compress"="SNAPPY");
create external table if not exists card_ucl
(card_id BIGINT, ucl DOUBLE)
STORED as ORC
LOCATION '/user/ec2-user/CCFraudDetection/input/hive/card_ucl/'
tblproperties ("orc.compress"="SNAPPY");
create external table if not exists card_zipcode
(card_id BIGINT, postcode INT, transaction_dt STRING)
STORED as ORC
LOCATION '/user/ec2-user/CCFraudDetection/input/hive/card_zipcode/'
tblproperties ("orc.compress"="SNAPPY");
Task 8.2 LOAD DATA IN STAGING TABLES from HIVE. Calculate the UCL, last_ten_transactions, zipcode
==================================================================================================
insert overwrite table card_score select cm.card_id, ms.score from card_member cm inner join member_score ms on cm.member_id = ms.member_id;
select count(*) from card_score;
insert overwrite table card_last_ten_transactions select card_id,amount, postcode, transaction_dt, status from (select card_id, member_id, amount, postcode, pos_id, transaction_dt, status, ROW_NUMBER() OVER (PARTITION BY card_id ORDER BY unix_timestamp(transaction_dt,'yyyy-MM-dd hh:mm:ss') DESC) AS ROW_NUMBER from card_transactions_hbase_master where status = 'GENUINE') master where master.ROW_NUMBER <= 10;
select count(*) from card_last_ten_transactions;
insert overwrite table card_ucl select card_id, (MovingAverage+3*StandardDeviation) as UCL from (select card_id, AVG(amount) as MovingAverage, STDDEV(amount) as StandardDeviation from card_last_ten_transactions group by card_id) temp;
select count(*) from card_ucl ;
insert overwrite table card_zipcode select card_id, postcode, transaction_dt from (select card_id, postcode, transaction_dt, ROW_NUMBER() OVER (PARTITION BY card_id ORDER BY unix_timestamp(transaction_dt,'yyyy-MM-dd hh:mm:ss') DESC) AS ROW_NUMBER from card_last_ten_transactions) temp where temp.ROW_NUMBER <= 1;
select count(*) from card_zipcode ;
Task 8.3 POPULATE THE LOOKUP TABLE
====================================
insert overwrite table card_transactions_lookup select cs.card_id, cu.ucl, cz.postcode, cz.transaction_dt, cs.score from card_score cs inner join card_ucl cu on cs.card_id = cu.card_id inner join card_zipcode cz on cs.card_id = cz.card_id;
select count(*) from card_transactions_lookup ;