-- Ship the transform script and a packaged Python 3 runtime to the cluster,
-- then tune shuffle/network behavior for this session.
-- NOTE(review): "hdfs://home/..." treats "home" as the namenode host; if the
-- path is meant to be absolute on the default FS it should likely be
-- "hdfs:///home/..." — confirm against the cluster config.
add file hdfs://home/user/py3_script/;
-- Distribute the bundled Python 3 interpreter archive; it is unpacked on each
-- executor and referenced below as 'py3.tar.gz/py3/bin/python'.
set spark.yarn.dist.archives=hdfs://home/user/py3.tar.gz;
-- Write shuffle data to HDFS (presumably for executor-loss resilience — TODO
-- confirm this property is supported by the deployed Spark build).
set spark.shuffle.hdfs.enabled=true;
-- Fail fast on shuffle fetch errors: a single retry with no wait between
-- attempts, but allow up to 120s for a network operation to complete.
set spark.shuffle.io.maxRetries=1;
set spark.shuffle.io.retryWait=0s;
set spark.network.timeout=120s;
-- Aggregate per-mid counts for one day of log_hourly, stream the rows through
-- the external Python transform script, and overwrite that day's partition of
-- seains.image_infos with the script's output.
INSERT OVERWRITE TABLE seains.image_infos PARTITION (date = '${date}')
SELECT
    -- TRANSFORM pipes each input row to the script's stdin as a
    -- tab-separated line; the script's tab-separated stdout lines are
    -- parsed back into the AS (...) columns.
    TRANSFORM(*) USING 'py3.tar.gz/py3/bin/python py3_script/script.py' AS (
        mid,
        uri,
        count
    )
FROM
(
    SELECT
        mid,                 -- fixed: missing comma made sum(count) an alias of mid
        SUM(count) AS count  -- fixed: aggregate needs an explicit alias
    FROM log_hourly
    WHERE p_date = '${date}'
    GROUP BY mid
) t  -- fixed: Hive requires an alias on a derived table
import sys

# Streaming transform invoked by the Hive TRANSFORM clause: reads
# tab-separated rows from stdin, buffers mids into batches, looks up their
# features, and prints one tab-separated output row per mid to stdout.
#
# NOTE(review): `get_batch` and `get_feature` are not defined in this file —
# presumably provided elsewhere (the source blog post elided them); confirm.


def _flush_batch(mids):
    """Look up features for the buffered mids and emit one output row each."""
    if not mids:
        return
    res = get_feature(mids)
    # NOTE(review): this unconditional "error" line looks like leftover debug
    # output; it goes to stderr, so it does not corrupt the TRANSFORM stdout.
    print("error", file=sys.stderr)
    for mid in mids:
        fields = [str(x) for x in res[mid]]
        print('\t'.join(fields))


total_mids = []
mid_infos = {}
for line in sys.stdin:
    # Example input row: "7321132368127836199\t66\t0\t"
    arr = line.strip('\n').split('\t')
    if len(arr) != 4:
        # Skip malformed rows; exactly 4 tab-separated fields are expected.
        continue
    mid = arr[0]
    total_mids.append(mid)
    # NOTE(review): mid_infos is populated but never read — dead state?
    mid_infos[mid] = arr
    if len(total_mids) >= get_batch:
        _flush_batch(total_mids)
        total_mids = []
        mid_infos = {}

# BUG FIX: flush the final partial batch. Previously any trailing rows
# (fewer than get_batch mids at EOF) were silently dropped.
_flush_batch(total_mids)
-- Source: CSDN blog post "Hive: creating a custom Python UDF" (the Python UDF only needs two columns)