Day2 :
Load the order.txt
create the table
order by - full sorting of data 1 reducer
select count from table2 order by count
https://drive.google.com/file/d/1abNo-jsy_l_Xo0krBoHpW3tkpSukr3Lp/view
order by will have do global sorting and it will use only one reducer.
We will run order by , sort by , cluster by and distribute by which are related to sorting
aa,1
bb,1
dd,5
ef,2
teh,1
CREATE TABLE IF NOT EXISTS table2(
name string,
count int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE
load data local inpath '/Users/basan/order.txt' into table table2
select count from table2 order by count
order by will take only 1 reducer , independent of how many reducers we throw
____ ___ ___ ____
//set explicitly 2 reducers
SET mapreduce.job.reduces=2
select count from table2 sort by count
it will produce 2 different data sets
sort by considers number of reducers and sorting happen in the respective reducer.
It will have
______ ___ ___
group by will also use single reducer
_______
distribute by - will send data to specific reducer
SET mapreduce.job.reduces=2
select count from table2 distribute by count sort by count
It will make sure there is no overlap of data
Ideally we should use distribute by and then sort by to be used
By doing this we can force the 2 number of reducers
____
cluster by is same as = distribute by count sort by count
select count from table2 cluster by count;
Rankings
create table using rankfunctions.txt
John,1500
Albert,1500
Mark,1000
Frank,1150
Frank,1700
CREATE TABLE IF NOT EXISTS rankchcektable(
col1 string,
col2 int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' line terminated by '\n' STORED AS TEXTFILE
load data local inpath '/Users/basan/rankfunctions.txt' into table rankchcektable
//overwrite the old data
load data local inpath '/Users/basan/rankfunctions.txt' overwrite into table rankchcektable
select col1, col2, rank() over (order by col2 desc) as ranking from table2
gives tanking, ties are assigned the same rank with the next ranking skipped
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 3
Frank,1700 5
Difference between rank, dense rank and rownum
select col1, col2, dense_rank() over (order by col2 desc) as ranking from table2
Ties are assigned same rank,Ranks are not skipped.
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 3
Frank,1700 4
____
row number : To find top n rankings it will be used
select col1, col2, row_number() over (order by col2 desc) as ranking from table2
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 4
Frank,1700 5
So rank assigned will be unique
____
select col1, col2, row_number() over (partition by col1 order by col2 desc) as ranking from table2
First it does grouping then assigns ranking
use case is top 2 spend per person.
Albert 1500 1
Bhut 800 1
lesa 1500 1
lesa 900 2
Load the order.txt
create the table
order by - full sorting of data 1 reducer
select count from table2 order by count
https://drive.google.com/file/d/1abNo-jsy_l_Xo0krBoHpW3tkpSukr3Lp/view
order by will have do global sorting and it will use only one reducer.
We will run order by , sort by , cluster by and distribute by which are related to sorting
aa,1
bb,1
dd,5
ef,2
teh,1
CREATE TABLE IF NOT EXISTS table2(
name string,
count int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE
load data local inpath '/Users/basan/order.txt' into table table2
select count from table2 order by count
order by will take only 1 reducer , independent of how many reducers we throw
____ ___ ___ ____
//set explicitly 2 reducers
SET mapreduce.job.reduces=2
select count from table2 sort by count
it will produce 2 different data sets
sort by considers number of reducers and sorting happen in the respective reducer.
It will have
______ ___ ___
group by will also use single reducer
_______
distribute by - will send data to specific reducer
SET mapreduce.job.reduces=2
select count from table2 distribute by count sort by count
It will make sure there is no overlap of data
Ideally we should use distribute by and then sort by to be used
By doing this we can force the 2 number of reducers
____
cluster by is same as = distribute by count sort by count
select count from table2 cluster by count;
Rankings
create table using rankfunctions.txt
John,1500
Albert,1500
Mark,1000
Frank,1150
Frank,1700
CREATE TABLE IF NOT EXISTS rankchcektable(
col1 string,
col2 int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' line terminated by '\n' STORED AS TEXTFILE
load data local inpath '/Users/basan/rankfunctions.txt' into table rankchcektable
//overwrite the old data
load data local inpath '/Users/basan/rankfunctions.txt' overwrite into table rankchcektable
select col1, col2, rank() over (order by col2 desc) as ranking from table2
gives tanking, ties are assigned the same rank with the next ranking skipped
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 3
Frank,1700 5
Difference between rank, dense rank and rownum
select col1, col2, dense_rank() over (order by col2 desc) as ranking from table2
Ties are assigned same rank,Ranks are not skipped.
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 3
Frank,1700 4
____
row number : To find top n rankings it will be used
select col1, col2, row_number() over (order by col2 desc) as ranking from table2
Mark,1000 1
Frank,1150 2
John,1500 3
Albert,1500 4
Frank,1700 5
So rank assigned will be unique
____
select col1, col2, row_number() over (partition by col1 order by col2 desc) as ranking from table2
First it does grouping then assigns ranking
use case is top 2 spend per person.
Albert 1500 1
Bhut 800 1
lesa 1500 1
lesa 900 2
No comments:
Post a Comment