MongoDB使用Int型TImestamp记录时间数据有更快的排序速度
- Talk is cheap, show you my code.
import mongoengine
import datetime
import os
import random
from threading import Thread
from mongoengine import (
StringField,
IntField,
DateTimeField,
FloatField,
Document
)
options = {'allowDiskUse': True}
MONGO_HOST = os.environ.get('MONGO_HOST')
if MONGO_HOST is None:
MONGO_HOST = None
MONGO_USER = os.environ.get('MONGO_USER')
if MONGO_USER is None:
MONGO_USER = None
MONGO_PASSWD = os.environ.get('MONGO_PASSWD')
if MONGO_PASSWD is None:
MONGO_PASSWD = None
print(MONGO_HOST, MONGO_USER, MONGO_PASSWD)
mongoengine.connect('testing_date', host=MONGO_HOST, port=27017, username=MONGO_USER, password=MONGO_PASSWD)
class DtData(Document):
name = StringField()
the_id = IntField()
date = DateTimeField()
class TsDataFloat(Document):
name = StringField()
the_id = IntField()
date = FloatField()
class TsDataInt(Document):
name = StringField()
the_id = IntField()
date = IntField()
def random_str():
"""
生成一个指定长度的随机字符串
"""
randomlength=16
random_str =''
base_str ='ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz0123456789'
length =len(base_str) -1
for i in range(randomlength):
random_str +=base_str[random.randint(0, length)]
return random_str
def insert_dt():
for i in range(100 * 10000):
random_int = random.randint(1, 100000000)
date = datetime.datetime.utcnow()
new = DtData(name=random_str(), the_id=random_int, date=date)
new.save()
print("dt:",i)
def insert_tsf():
for i in range(100 * 10000):
random_int = random.randint(1, 100000000)
date = datetime.datetime.utcnow().timestamp()
new = TsDataFloat(name=random_str(), the_id=random_int, date=date)
new.save()
print("tsf:",i)
def insert_tsi():
for i in range(100 * 10000):
random_int = random.randint(1, 100000000)
date = int(datetime.datetime.utcnow().timestamp())
new = TsDataInt(name=random_str(), the_id=random_int, date=date)
new.save()
print("tsi:",i)
def query_dt():
pipeline = [
{
"$sort" : {
"date" : -1
}
}
]
dt_start = datetime.datetime.utcnow()
res_dt = DtData.objects.aggregate(pipeline, **options)
dt_stop = datetime.datetime.utcnow()
return (dt_stop - dt_start).total_seconds()
def query_tsf():
pipeline = [
{
"$sort" : {
"date" : -1
}
}
]
ts_start = datetime.datetime.utcnow()
res_ts = TsDataFloat.objects.aggregate(pipeline, **options)
ts_stop = datetime.datetime.utcnow()
return (ts_stop - ts_start).total_seconds()
def query_tsi():
pipeline = [
{
"$sort" : {
"date" : -1
}
}
]
ts_start = datetime.datetime.utcnow()
res_int_ts = TsDataInt.objects.aggregate(pipeline, **options)
ts_stop = datetime.datetime.utcnow()
return (ts_stop - ts_start).total_seconds()
option_input = input("c : [create]\nq : [query]\nOptions:")
if option_input == "c":
task_dt = Thread(target=insert_dt)
task_ts = Thread(target=insert_tsf)
task_int_ts = Thread(target=insert_tsi)
task_dt.start()
task_ts.start()
task_int_ts.start()
elif option_input == "q":
times = 100
dt_perf = []
tsf_perf = []
tsi_perf = []
for i in range(times):
print("time", i)
dt_res = query_dt()
tsf_res = query_tsf()
tsi_res = query_tsi()
dt_perf.append(dt_res)
tsf_perf.append(tsf_res)
tsi_perf.append(tsi_res)
dt_avg = sum(dt_perf) / len(dt_perf)
tsf_avg = sum(tsf_perf) / len(tsf_perf)
tsi_avg = sum(tsi_perf) / len(tsi_perf)
dt_std = (sum([(i - dt_avg) ** 2 for i in dt_perf]) / len(dt_perf)) ** 0.5
tsf_std = (sum([(i - tsf_avg) ** 2 for i in tsf_perf]) / len(tsf_perf)) ** 0.5
tsi_std = (sum([(i - tsi_avg) ** 2 for i in tsi_perf]) / len(tsi_perf)) ** 0.5
dt_var = dt_std ** 2
tsf_var = tsf_std ** 2
tsi_var = tsi_std ** 2
dt_cov = sum([(dt_perf[i] - dt_avg) * (tsf_perf[i] - tsf_avg) for i in range(len(dt_perf))]) / len(dt_perf)
tsf_cov = sum([(tsf_perf[i] - tsf_avg) * (tsi_perf[i] - tsi_avg) for i in range(len(tsf_perf))]) / len(tsf_perf)
tsi_cov = sum([(tsi_perf[i] - tsi_avg) * (dt_perf[i] - dt_avg) for i in range(len(tsi_perf))]) / len(tsi_perf)
dt_worst = sorted(dt_perf, reverse=True)[:10]
tsf_worst = sorted(tsf_perf, reverse=True)[:10]
tsi_worst = sorted(tsi_perf, reverse=True)[:10]
dt_best = sorted(dt_perf)[:10]
tsf_best = sorted(tsf_perf)[:10]
tsi_best = sorted(tsi_perf)[:10]
print("----------平均值------------")
print("datetime:", dt_avg)
print("ts float:", tsf_avg)
print("ts int :", tsi_avg)
print("-----------方差-----------")
print("datetime:", dt_var)
print("ts float:", tsf_var)
print("ts int :", tsi_var)
print("-----------协方差-----------")
print("datetime:", dt_cov)
print("ts float:", tsf_cov)
print("ts int :", tsi_cov)
print("||||||||||||||||||||||")
print("||||||||||||||||||||||")
print("----------最坏情况------------")
print("datetime:", dt_worst)
print("ts float:", tsf_worst)
print("ts int :", tsi_worst)
print("----------最好情况------------")
print("datetime:", dt_best)
print("ts float:", tsf_best)
print("ts int :", tsi_best)
- 我得到的结果:
----------平均值------------
datetime: 0.49624905000000014
ts float: 0.50869789
ts int : 0.38799239
-----------方差-----------
datetime: 0.0001654481014074999
ts float: 0.0001566129328779001
ts int : 0.0001354024412778999
-----------协方差-----------
datetime: 4.3927028885500086e-05
ts float: 4.275914065290002e-05
ts int : 4.900592647049995e-05
||||||||||||||||||||||
||||||||||||||||||||||
----------最坏情况------------
datetime: [0.546375, 0.52904, 0.522622, 0.519766, 0.519098, 0.517002, 0.515263, 0.514543, 0.512385, 0.51231]
ts float: [0.534344, 0.532332, 0.530186, 0.530127, 0.529696, 0.528711, 0.527799, 0.527282, 0.526763, 0.526274]
ts int : [0.427955, 0.416631, 0.41039, 0.406159, 0.405257, 0.405068, 0.404763, 0.404012, 0.403814, 0.402175]
----------最好情况------------
datetime: [0.477207, 0.477904, 0.47898, 0.479608, 0.479876, 0.479951, 0.480098, 0.480977, 0.481021, 0.481765]
ts float: [0.486213, 0.488016, 0.488476, 0.4894, 0.489478, 0.489946, 0.490292, 0.490379, 0.491225, 0.492103]
ts int : [0.368757, 0.369022, 0.370764, 0.371021, 0.372055, 0.372325, 0.372414, 0.372813, 0.372818, 0.373015]
- 可以看到的是Float类型的日期字段与Datetime的差距不大,但是Int型的日期字段具有大约22%的性能差距(在百万级数据量的基础上),Int型的TImestamp的精度在秒级,小数部分精度为毫秒。大部分场景下,用Int型的的日期字段会更有性能优势。