Last active
April 14, 2021 05:45
-
-
Save tkm2261/1b3c3c37753e55ed2914577c0f96d222 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Feature engineering BigQuery SQL queries for the kaggle talkingdata competition by tkm2261 | |
-- it may acheve 0.9823 on the public LB with simple GBDT. | |
-- destination table: takling.test2 | |
SELECT | |
CASE WHEN t.click_id is null THEN -1 ELSE t.click_id END as click_id, | |
o.* | |
FROM | |
`talking.test_supplement` as o | |
LEFT OUTER JOIN | |
`talking.test` as t | |
ON | |
o.ip = t.ip AND | |
o.app = t.app AND | |
o.device = t.device AND | |
o.os = t.os AND | |
o.channel = t.channel AND | |
o.click_time = t.click_time | |
-- destination table: takling.train_test | |
SELECT | |
null as click_id, | |
ip, app, device, os, channel, click_time, attributed_time, is_attributed, | |
EXTRACT(year from click_time) as year, | |
EXTRACT(month from click_time) as month, | |
EXTRACT(day from click_time) as day, | |
EXTRACT(DAYOFWEEK from o.click_time) as dayofweek, | |
EXTRACT(HOUR from click_time) as hour, | |
EXTRACT(MINUTE from.click_time) as minute, | |
EXTRACT(SECOND from click_time) as second | |
FROM | |
`talking.train` | |
UNION ALL | |
SELECT | |
click_id, | |
ip, app, device, os, channel, click_time, null as attributed_time, null as is_attributed, | |
EXTRACT(year from click_time) as year, | |
EXTRACT(month from click_time) as month, | |
EXTRACT(day from click_time) as day, | |
EXTRACT(DAYOFWEEK from o.click_time) as dayofweek, | |
EXTRACT(HOUR from click_time) as hour, | |
EXTRACT(MINUTE from.click_time) as minute, | |
EXTRACT(SECOND from click_time) as second | |
FROM | |
`talking.test2` | |
-- destination table: takling.mst_dayhouripos | |
SELECT | |
day, hour, ip, os, count(1) cnt, | |
TIMESTAMP_DIFF(MAX(click_time), MIN(click_time), SECOND) as diff | |
FROM `talking.train_test` | |
GROUP BY day, hour, ip, os | |
-- destination table: takling.mst_dayiphourapp | |
SELECT | |
day, hour, ip, count(1) cnt, | |
TIMESTAMP_DIFF(MAX(click_time), MIN(click_time), SECOND) as diff, | |
count(distinct hour) uq_hour, | |
count(distinct channel) uq_channel, | |
count(distinct app) uq_app, | |
count(distinct device) uq_device | |
FROM `talking.train_test` | |
GROUP BY day, hour, ip | |
-- destination table: takling.mst_dayhourdevice | |
SELECT | |
day, hour, device, count(1) cnt, | |
TIMESTAMP_DIFF(MAX(click_time), MIN(click_time), SECOND) as diff | |
FROM `talking.train_test` | |
GROUP BY day, hour, device | |
-- destination table: takling.mst_app | |
SELECT | |
app, | |
count(1) as cnt, | |
count(distinct channel) uq_channel | |
FROM `talking.train_test` | |
GROUP BY app | |
-- destination table: takling.mst_ip | |
SELECT | |
ip, | |
count(1) cnt_ip, | |
count(distinct channel) uq_channel, | |
count(distinct app) uq_app, | |
count(distinct device) uq_device, | |
count(distinct os) uq_os, | |
STDDEV(UNIX_MICROS(click_time) / 1000) std_ip_time, | |
STDDEV(hour) std_ip_hour, | |
STDDEV(minute) std_ip_minute, | |
STDDEV(second) std_ip_second | |
FROM `talking.train_test` | |
GROUP BY ip | |
-- destination table: takling.uq_app_ipdevice | |
SELECT | |
t.ip, t.device, count(distinct t.app) as uq_app_ipdevice, count(1) as cnt_app_ipdevice | |
FROM `talking.train_test` as t | |
GROUP BY t.ip, t.device | |
-- destination table: talking.uq_channel_iposdevice | |
SELECT | |
t.ip, t.os, t.device, count(distinct t.channel) as uq_channel_iposdevice, count(1) as cnt_channel_iposdevice | |
FROM `talking2.train_test` as t | |
GROUP BY t.ip, t.os, t.device | |
-- destination table: takling.train_test2 | |
SELECT | |
t.click_id, | |
t.is_attributed , | |
t.day, | |
t.hour, | |
t.minute, | |
t.second, | |
t.ip, -- please omit ip when training. | |
t.os, | |
t.app, | |
t.channel, | |
t.device, | |
ROW_NUMBER() OVER(partition by t.ip, t.day, t.hour order by click_time) as row_ip, | |
ROW_NUMBER() OVER(partition by t.ip, t.day, t.hour order by click_time desc) as row_ip_r, | |
ROW_NUMBER() OVER(partition by t.ip, t.day, t.hour, t.os, t.app, t.device order by click_time) as row_no_channel, | |
ROW_NUMBER() OVER(partition by t.ip, t.day, t.hour, t.os, t.app, t.device order by click_time desc) as row_no_channel_r, | |
uq_app_ipdevice, | |
uq_channel_iposdevice, | |
cnt_app_ipdevice, | |
cnt_channel_iposdevice, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.app order by click_time desc), SECOND) as nextclick_2, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.os order by click_time desc), SECOND) as nextclick_4, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.app, t.os, t.device order by click_time desc), SECOND) as nextclick_13, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 2) OVER(partition by t.ip, t.day, t.app order by click_time desc), SECOND) as nextclick_2_2, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.app order by click_time), SECOND) as nextclick_2_b, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.os order by click_time), SECOND) as nextclick_4_b, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.day, t.channel order by click_time), SECOND) as nextclick_6_b, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.app order by click_time desc), SECOND) as nextclick_2_n, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.os order by click_time desc), SECOND) as nextclick_4_n, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.channel order by click_time desc), SECOND) as nextclick_6_n, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.app, t.os order by click_time desc), SECOND) as nextclick_7_n, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 1) OVER(partition by t.ip, t.app, t.os, t.device order by click_time desc), SECOND) as nextclick_13_n, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 2) OVER(partition by t.ip, t.app, t.os, t.device order by click_time desc), SECOND) as nextclick_13_n2, | |
TIMESTAMP_DIFF(click_time, LAG(click_time, 3) OVER(partition by t.ip, t.app, t.os, t.device order by click_time desc), SECOND) as nextclick_13_n3, | |
STDDEV(UNIX_MICROS(click_time) / 1000) OVER(partition by t.ip, t.app) as stdtime_1, | |
STDDEV(UNIX_MICROS(click_time) / 1000) OVER(partition by t.ip, t.os) as stdtime_2, | |
STDDEV(UNIX_MICROS(click_time) / 1000) OVER(partition by t.ip, t.channel) as stdtime_3, | |
TIMESTAMP_DIFF(MAX(click_time) OVER(partition by t.ip, t.os), MIN(click_time) OVER(partition by t.ip, t.os), SECOND) as dur_2, | |
TIMESTAMP_DIFF(MAX(click_time) OVER(partition by t.ip, t.channel), MIN(click_time) OVER(partition by t.ip, t.channel), SECOND) as dur_3, | |
STDDEV(t.day) OVER(partition by t.ip, t.app, t.channel) std_day_ipappchannel, | |
STDDEV(t.hour) OVER(partition by t.ip, t.day ) as std_1, | |
STDDEV(t.hour) OVER(partition by t.ip, t.day, t.app ) as std_3, | |
STDDEV(t.hour) OVER(partition by t.ip, t.day, t.channel ) as std_ipdaychannel, | |
STDDEV(t.hour) OVER(partition by t.ip, t.day, t.os, t.device ) as std_8, | |
a.cnt as cnt_dayiphourapp, | |
dd.cnt as cnt_dayhourdevice, | |
ma.cnt as cnt_ma, | |
o.diff as diff_dayhouripos, | |
mi.cnt_ip, | |
mi.uq_channel as uq_channel_ip, | |
mi.uq_app as uq_app_ip, | |
mi.uq_device as uq_device_ip, | |
mi.uq_os as uq_os_ip, | |
mi.std_ip_time, | |
mi.std_ip_hour, | |
mi.std_ip_minute, | |
mi.std_ip_second | |
FROM | |
`talking.train_test` as t | |
LEFT OUTER JOIN `talking.mst_dayiphourapp` as a | |
ON a.day = t.day and a.hour = t.hour and a.ip = t.ip | |
LEFT OUTER JOIN `talking.mst_dayhouripos` as o | |
ON o.day = t.day and o.hour = t.hour and o.ip = t.ip and o.os = t.os | |
-- | |
LEFT OUTER JOIN `talking.mst_dayhourdevice` as dd | |
ON dd.day = t.day and dd.hour = t.hour and dd.device = t.device | |
-- | |
LEFT OUTER JOIN `talking.mst_app` as ma | |
ON ma.app = t.app | |
-- | |
LEFT OUTER JOIN `talking.mst_ip` as mi | |
ON mi.ip = t.ip | |
-- | |
LEFT OUTER JOIN `talking.uq_app_ipdevice` as uai | |
ON uai.ip = t.ip and uai.device = t.device | |
LEFT OUTER JOIN `talking.uq_channel_iposdevice` as ucd | |
ON ucd.ip = t.ip and ucd.os = t.os and ucd.device = t.device | |
-- destination table: takling.stdd_1 | |
SELECT | |
ip, STDDEV(hour) stdd_1, AVG(hour) avgd_1, STDDEV(day) stdday_1 | |
FROM | |
`talking.train_test` | |
GROUP BY ip | |
-- destination table: takling.stdd_3 | |
SELECT | |
ip, app, STDDEV(hour) stdd_3, AVG(hour) avgd_3, STDDEV(day) stdday_3 | |
FROM | |
`talking.train_test` | |
GROUP BY ip, app | |
-- destination table: takling.stdd_4 | |
SELECT | |
ip, channel, STDDEV(hour) stdd_4, AVG(hour) avgd_4, STDDEV(day) stdday_4 | |
FROM | |
`talking.train_test` | |
GROUP BY ip, channel | |
-- destination table: takling.datamart | |
SELECT | |
t.*, | |
count(1) OVER(partition by t.ip, t.day) as cntt_ip, | |
a.stdd_1, stdd_3, stdd_4, | |
STDDEV(t.hour) OVER(partition by t.ip, t.os, t.app, t.channel ) as stdd_15 | |
FROM | |
`talking.train_test2` as t | |
LEFT OUTER JOIN `talking.stdd_1` as a | |
ON a.ip = t.ip | |
LEFT OUTER JOIN `talking.stdd_3` as c | |
ON c.ip = t.ip and c.app = t.app | |
LEFT OUTER JOIN `talking.stdd_4` as d | |
ON d.ip = t.ip and d.channel = t.channel | |
-- destination table: takling.train_data | |
SELECT | |
* | |
FROM | |
`talking4.datamart` | |
WHERE | |
day <= 8 | |
-- destination table: takling.valid_data | |
SELECT | |
* | |
FROM | |
`talking4.datamart` | |
WHERE | |
day = 9 | |
-- destination table: takling.test_data | |
SELECT | |
* | |
FROM | |
`talking.datamart` | |
WHERE | |
day = 10 AND click_id >= 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment