mirror of
https://github.com/netfun2000/ip2region.git
synced 2026-02-27 09:44:31 +08:00
Standardized code and add comments
This commit is contained in:
@@ -10,8 +10,6 @@ ip2region xdb maker
|
||||
main.py [command] [command options]
|
||||
Command:
|
||||
gen generate the binary db file
|
||||
search binary xdb search test
|
||||
bench binary xdb bench test
|
||||
```
|
||||
|
||||
# `xdb` 数据生成
|
||||
@@ -35,56 +33,6 @@ options:
|
||||
```
|
||||
|
||||
|
||||
# `xdb` 数据查询
|
||||
# `xdb` 数据查询 和 bench 测试
|
||||
|
||||
通过 `python main.py search` 命令来测试查询输入的 ip:
|
||||
```
|
||||
➜ python git:(v2.0_xdb) ✗ python main.py search
|
||||
main.py search [command options]
|
||||
options:
|
||||
--db string ip2region binary xdb file path
|
||||
```
|
||||
|
||||
例如,使用自带的 xdb 文件来运行查询测试:
|
||||
```
|
||||
➜ python git:(v2.0_xdb) ✗ python main.py search --db=./ip2region.xdb
|
||||
ip2region xdb search test program, commands:
|
||||
loadIndex : load the vector index for search speedup.
|
||||
clearIndex: clear the vector index.
|
||||
quit : exit the test program
|
||||
ip2region>> 117.148.181.111
|
||||
[region:中国|0|浙江省|嘉兴市|移动, took:0s]
|
||||
ip2region>> 120.196.20.28
|
||||
[region:中国|0|广东省|茂名市|移动, took:0s]
|
||||
ip2region>> 81.33.22.150
|
||||
[region:西班牙|0|马德里|马德里|西班牙电信, took:0s]
|
||||
ip2region>>
|
||||
```
|
||||
|
||||
# bench 测试
|
||||
|
||||
如果你自主生成了 `xdb` 文件,请运行如下的 `python main.py bench` 命令来确保生成的 `xdb` 文件的正确性:
|
||||
```
|
||||
➜ python git:(v2.0_xdb) ✗ python main.py bench
|
||||
main.py bench [command options]
|
||||
options:
|
||||
--db string ip2region binary xdb file path
|
||||
--src string source ip text file path
|
||||
--ignore-error bool keep going if bench failed
|
||||
```
|
||||
|
||||
例如:使用 data/ip.merge.txt 源文件来 bench 测试 data/ip2region.xdb 这个 xdb 文件:
|
||||
```
|
||||
➜ python git:(v2.0_xdb) ✗ python main.py bench --db=../../data/ip2region.xdb --src=../../data/ip.merge.txt
|
||||
# 会看到一堆输出,看到类似如下的数据表示 bench 测试通过了,否则就会报错
|
||||
...
|
||||
try to bench segment: `{}` 224.0.0.0|255.255.255.255|0|0|0|内网IP|内网IP
|
||||
|-try to bench ip '224.0.0.0' ... --[Ok]
|
||||
|-try to bench ip '231.255.255.255' ... --[Ok]
|
||||
|-try to bench ip '239.255.255.255' ... --[Ok]
|
||||
|-try to bench ip '247.255.255.255' ... --[Ok]
|
||||
|-try to bench ip '255.255.255.255' ... --[Ok]
|
||||
Bench finished, [count: 3417955, failed: 0, took: 88.061s]
|
||||
```
|
||||
*请注意 bench 测试使用的 `src` 文件需要与生成 ip2region.xdb 时使用的源文件相同*。
|
||||
如果运行过程中有错误会立马停止运行,也可以执行 --ignore-error=true 参数来忽略错误,在最后看 failed 的统计结果。
|
||||
基于xdb格式的查询功能和测试见 [ip2region binding](https://github.com/lionsoul2014/ip2region/tree/master/binding)
|
||||
@@ -1,34 +1,35 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import xdb.maker as mk
|
||||
import xdb.index as idx
|
||||
import xdb.searcher as sc
|
||||
import xdb.util as util
|
||||
|
||||
# format log
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(name)s-%(lineno)s-%(levelname)s - %(message)s')
|
||||
# Format log
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s-%(name)s-%(lineno)s-%(levelname)s - %(message)s",
|
||||
)
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def print_help():
|
||||
print("ip2region xdb maker")
|
||||
print("ip2region xdb python maker")
|
||||
print("{} [command] [command options]".format(sys.argv[0]))
|
||||
print("Command: ")
|
||||
print(" gen generate the binary db file")
|
||||
print(" search binary xdb search test")
|
||||
print(" bench binary xdb bench test")
|
||||
|
||||
|
||||
def gen_db():
|
||||
src_file, dst_file = "", ""
|
||||
index_policy = idx.VectorIndexPolicy
|
||||
# check input argv
|
||||
index_policy = idx.Vector_Index_Policy
|
||||
# Check input parameters
|
||||
for i in range(2, len(sys.argv)):
|
||||
r = sys.argv[i]
|
||||
if len(r) < 5:
|
||||
@@ -40,11 +41,11 @@ def gen_db():
|
||||
print("missing = for args pair '{}'".format(r))
|
||||
return
|
||||
if r[2:s_idx] == "src":
|
||||
src_file = r[s_idx+1:]
|
||||
src_file = r[s_idx + 1:]
|
||||
elif r[2:s_idx] == "dst":
|
||||
dst_file = r[s_idx+1:]
|
||||
dst_file = r[s_idx + 1:]
|
||||
elif r[2:s_idx] == "index":
|
||||
index_policy = idx.index_policy_from_string(r[s_idx+1:])
|
||||
index_policy = idx.index_policy_from_string(r[s_idx + 1:])
|
||||
else:
|
||||
print("undefined option `{}`".format(r))
|
||||
return
|
||||
@@ -56,142 +57,17 @@ def gen_db():
|
||||
return
|
||||
|
||||
start_time = time.time()
|
||||
# make the binary file
|
||||
# Make the binary file
|
||||
maker = mk.new_maker(index_policy, src_file, dst_file)
|
||||
maker.init()
|
||||
maker.start()
|
||||
maker.end()
|
||||
|
||||
logging.info("Done, elapsed: {:.0f}m{:.0f}s".format((time.time() - start_time) / 60, (time.time() - start_time) % 60))
|
||||
|
||||
|
||||
def test_search():
    """Run an interactive ip lookup prompt against an xdb file.

    Options are read from ``sys.argv[2:]`` as ``--name=value`` pairs; the
    only accepted option is ``--db``. Usage is printed when it is missing.

    Prompt commands: ``loadIndex`` caches the vector index on the searcher,
    ``clearIndex`` drops it, ``quit`` (or end of input) leaves the loop.
    Any other input is treated as an ip address and looked up.
    """
    db_file = ""
    for r in sys.argv[2:]:
        if len(r) < 5 or not r.startswith("--"):
            continue
        # str.find returns -1 when "=" is absent; str.index would raise
        # ValueError and the error branch below could never run.
        e_idx = r.find("=")
        if e_idx < 0:
            print("missing = for args pair '{}'".format(r))
            return
        if r[2:e_idx] == "db":
            db_file = r[e_idx + 1:]
        else:
            print("undefined option `{}`".format(r))
            return

    if db_file == "":
        print("{} search [command options]".format(sys.argv[0]))
        print("options:")
        print(" --db string ip2region binary xdb file path")
        return

    cb = sc.XdbSearcher.loadContentFromFile(dbfile=db_file)
    if cb is None:
        # loadContentFromFile already printed the I/O error.
        return
    searcher = sc.XdbSearcher(contentBuff=cb)
    print("ip2region xdb search test program, commands:\nloadIndex : load the vector index for search "
          "speedup.\nclearIndex: clear the vector index.\nquit : exit the test program")
    while True:
        print("ip2region>> ", end="")
        try:
            line = input()
        except EOFError:
            # Closed stdin behaves like "quit".
            break

        # Command interception and execution
        if line == "loadIndex":
            # Keep the loaded bytes on the searcher; previously the result
            # was discarded, so nothing was actually cached.
            searcher.vectorIndex = sc.XdbSearcher.loadVectorIndexFromFile(dbfile=db_file)
            print("vector index cached")
            continue
        elif line == "clearIndex":
            searcher.vectorIndex = None
            print("vector index cleared")
            continue
        elif line == "quit":
            break

        ip = util.checkip(line)
        if ip == -1:
            print("invalid ip address `{}`".format(line))
            continue

        s_tm = datetime.now()
        region = searcher.search(ip)
        # Use the full timedelta: subtracting the two `.microsecond` fields
        # ignores whole seconds and can go negative.
        took = (datetime.now() - s_tm).total_seconds()
        # TODO calculate io count in `searcher.search` method
        print("\x1b[0;32m[region:{}, took:{:.0f}s]\x1b[0m".format(region, took))
|
||||
logging.info(
|
||||
"Done, elapsed: {:.0f}m{:.0f}s".format(
|
||||
(time.time() - start_time) / 60, (time.time() - start_time) % 60
|
||||
)
|
||||
|
||||
|
||||
def test_bench():
    """Check an xdb file against the source ip text file.

    Options are read from ``sys.argv[2:]`` as ``--name=value`` pairs:
    ``--db`` (xdb file), ``--src`` (source text file) and ``--ignore-error``
    (``true``/``1`` or ``false``/``0``). Usage is printed when ``--db`` or
    ``--src`` is missing.

    Each source line is ``start_ip|end_ip|region``. Five ips per segment
    (start, end and three midpoints) are searched and the result compared
    with the region text. The run stops at the first mismatch unless
    ``--ignore-error`` is enabled; a summary line is printed at the end.
    """
    db_file, src_file = "", ""
    ignore_error = False
    for r in sys.argv[2:]:
        if len(r) < 5 or not r.startswith("--"):
            continue
        # str.find returns -1 when "=" is absent; str.index would raise
        # ValueError and the error branch below could never run.
        s_idx = r.find("=")
        if s_idx < 0:
            print("missing = for args pair '{}'".format(r))
            return
        key, value = r[2:s_idx], r[s_idx + 1:]
        if key == "db":
            db_file = value
        elif key == "src":
            src_file = value
        elif key == "ignore-error":
            if value in ("true", "1"):
                ignore_error = True
            elif value in ("false", "0"):
                ignore_error = False
            else:
                print("invalid value for ignore-error option, could be false/0 or true/1")
                return
        else:
            print("undefined option `{}`".format(r))
            return

    if db_file == "" or src_file == "":
        print("{} bench [command options]".format(sys.argv[0]))
        print("options:")
        print(" --db string ip2region binary xdb file path")
        print(" --src string source ip text file path")
        print(" --ignore-error bool keep going if bench failed")
        return

    cb = sc.XdbSearcher.loadContentFromFile(dbfile=db_file)
    if cb is None:
        # loadContentFromFile already printed the I/O error.
        return
    searcher = sc.XdbSearcher(contentBuff=cb)
    cnt, err_cnt, s_tm = 0, 0, time.time()
    with open(src_file, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    for line in lines:
        # The region text itself contains "|", so split at most twice.
        ps = line.split("|", maxsplit=2)
        if len(ps) != 3:
            print("invalid ip segment line `{}`".format(line))
            return
        sip = util.checkip(ps[0])
        if sip == -1:
            print("invalid ip address `{}`".format(line))
            return
        eip = util.checkip(ps[1])
        if eip == -1:
            print("invalid ip address `{}`".format(line))
            return

        # Format the placeholder; it used to be printed as a literal "{}".
        print("try to bench segment: `{}`".format(line))
        mip = util.mid_ip(sip, eip)
        for ip in [sip, util.mid_ip(sip, mip), mip, util.mid_ip(mip, eip), eip]:
            print("|-try to bench ip '{}' ...".format(util.long2ip(ip)), end="")
            region = searcher.search(ip)

            # Check the region info
            cnt += 1
            if region != ps[2]:
                err_cnt += 1
                print(" --[Failed] ({} != {})".format(region, ps[2]))
                if not ignore_error:
                    return
            else:
                print(" --[Ok]")

    print("Bench finished, [count: {}, failed: {}, took: {:.3f}s]".format(cnt, err_cnt, time.time() - s_tm))
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
@@ -202,13 +78,9 @@ def main():
|
||||
cmd = sys.argv[1].lower()
|
||||
if cmd == "gen":
|
||||
gen_db()
|
||||
elif cmd == "search":
|
||||
test_search()
|
||||
elif cmd == "bench":
|
||||
test_bench()
|
||||
else:
|
||||
print_help()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,2 +1,6 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
|
||||
@@ -1,21 +1,24 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
import struct
|
||||
|
||||
VectorIndexPolicy = 1
|
||||
BTreeIndexPolicy = 2
|
||||
SegmentIndexBlockSize = 14
|
||||
Vector_Index_Policy = 1
|
||||
BTree_Index_Policy = 2
|
||||
|
||||
|
||||
def index_policy_from_string(s: str) -> int:
|
||||
sl = s.lower()
|
||||
if sl == "vector":
|
||||
return VectorIndexPolicy
|
||||
return Vector_Index_Policy
|
||||
elif sl == "btree":
|
||||
return BTreeIndexPolicy
|
||||
return BTree_Index_Policy
|
||||
else:
|
||||
print("invalid policy `{}`, used default vector index".format(s))
|
||||
return VectorIndexPolicy
|
||||
return Vector_Index_Policy
|
||||
|
||||
|
||||
class VectorIndexBlock:
|
||||
@@ -26,11 +29,14 @@ class VectorIndexBlock:
|
||||
self.first_ptr = fp
|
||||
self.last_ptr = lp
|
||||
|
||||
def __str__(self):
|
||||
return "FirstPtr: {}, LastPrt: {}".format(self.first_ptr, self.last_ptr)
|
||||
|
||||
def encode(self) -> bytes:
|
||||
return struct.pack("<II", self.first_ptr, self.last_ptr)
|
||||
|
||||
def string(self) -> str:
|
||||
return "FirstPtr: {}, LastPrt: {}".format(self.first_ptr, self.last_ptr)
|
||||
|
||||
Segment_Index_Block_Size = 14
|
||||
|
||||
|
||||
class SegmentIndexBlock:
|
||||
@@ -45,8 +51,12 @@ class SegmentIndexBlock:
|
||||
self.data_len = dl
|
||||
self.data_ptr = dp
|
||||
|
||||
def encode(self) -> bytes:
|
||||
return struct.pack("<IIHI", self.start_ip, self.end_ip, self.data_len, self.data_ptr)
|
||||
def __str__(self):
|
||||
return "{sip: {}, eip: {}, len: {}, ptr: {}}".format(
|
||||
self.start_ip, self.end_ip, self.data_len, self.data_ptr
|
||||
)
|
||||
|
||||
def string(self) -> str:
|
||||
return "{sip: {}, eip: {}, len: {}, ptr: {}}".format(self.start_ip, self.end_ip, self.data_len, self.data_ptr)
|
||||
def encode(self) -> bytes:
|
||||
return struct.pack(
|
||||
"<IIHI", self.start_ip, self.end_ip, self.data_len, self.data_ptr
|
||||
)
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
# ----
|
||||
# ip2region database v2.0 structure
|
||||
@@ -36,40 +39,39 @@
|
||||
#
|
||||
# data entry structure:
|
||||
# +--------------------+-----------------------+
|
||||
# | 2bytes (for desc) | dynamic length |
|
||||
# | 2bytes (for desc) | dynamic length |
|
||||
# +--------------------+-----------------------+
|
||||
# data length whatever in bytes
|
||||
#
|
||||
# index entry structure
|
||||
# +------------+-----------+---------------+------------+
|
||||
# | 4bytes | 4bytes | 2bytes | 4 bytes |
|
||||
# | 4bytes | 4bytes | 2bytes | 4 bytes |
|
||||
# +------------+-----------+---------------+------------+
|
||||
# start ip end ip data length data ptr
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
|
||||
import logging
|
||||
import struct
|
||||
import time
|
||||
import segment as seg
|
||||
import index as idx
|
||||
import util
|
||||
import sys
|
||||
|
||||
import xdb.segment as seg
|
||||
import xdb.index as idx
|
||||
import xdb.util as util
|
||||
|
||||
|
||||
VersionNo = 2
|
||||
HeaderInfoLength = 256
|
||||
VectorIndexRows = 256
|
||||
VectorIndexCols = 256
|
||||
VectorIndexSize = 8
|
||||
VectorIndexLength = VectorIndexRows * VectorIndexCols * VectorIndexSize
|
||||
Version_No = 2
|
||||
Header_Info_Length = 256
|
||||
Vector_Index_Rows = 256
|
||||
Vector_Index_Cols = 256
|
||||
Vector_Index_Size = 8
|
||||
Vector_Index_Length = Vector_Index_Rows * Vector_Index_Cols * Vector_Index_Size
|
||||
|
||||
|
||||
class Maker:
|
||||
src_handle = None
|
||||
dst_handle = None
|
||||
index_policy = 0
|
||||
segments = []
|
||||
region_pool = {}
|
||||
index_policy = idx.Vector_Index_Policy
|
||||
segments = None
|
||||
region_pool = None
|
||||
vector_index = None
|
||||
|
||||
def __init__(self, sh, dh, ip, sg, rp, vi):
|
||||
@@ -83,30 +85,32 @@ class Maker:
|
||||
def init(self):
|
||||
"""
|
||||
Init the `xdb` binary file.
|
||||
1. init the file header
|
||||
2. load all the segments
|
||||
1. Init the file header
|
||||
2. Load all the segments
|
||||
"""
|
||||
self.init_db_header()
|
||||
self.load_segments()
|
||||
|
||||
def init_db_header(self):
|
||||
"""Init and write the file header to the destination xdb file."""
|
||||
"""
|
||||
Init and write the file header to the destination xdb file.
|
||||
"""
|
||||
logging.info("try to init the db header ... ")
|
||||
self.src_handle.seek(0, 0)
|
||||
|
||||
header = bytearray([0]*256)
|
||||
# make and write the header space
|
||||
# 1. version number
|
||||
header[0:2] = VersionNo.to_bytes(2, byteorder="little")
|
||||
# 2. index policy code
|
||||
# Make and write the header space
|
||||
header = bytearray([0] * 256)
|
||||
# 1. Version number
|
||||
header[0:2] = Version_No.to_bytes(2, byteorder="little")
|
||||
# 2. Index policy code
|
||||
header[2:4] = int(self.index_policy).to_bytes(2, byteorder="little")
|
||||
# 3. generate unix timestamp
|
||||
# 3. Generate unix timestamp
|
||||
header[4:8] = int(time.time()).to_bytes(4, byteorder="little")
|
||||
# 4. index block start ptr
|
||||
# 4. Index block start ptr
|
||||
header[8:12] = int(0).to_bytes(4, byteorder="little")
|
||||
# 5. index block end ptr
|
||||
# 5. Index block end ptr
|
||||
header[12:16] = int(0).to_bytes(4, byteorder="little")
|
||||
# write header buffer to file
|
||||
# Write header buffer to file
|
||||
self.dst_handle.write(header)
|
||||
|
||||
def load_segments(self) -> list:
|
||||
@@ -125,122 +129,158 @@ class Maker:
|
||||
if len(ps) != 3:
|
||||
logging.error("invalid ip segment line `{}`".format(line))
|
||||
return []
|
||||
sip = util.checkip(ps[0])
|
||||
sip = util.check_ip(ps[0])
|
||||
if sip == -1:
|
||||
logging.error("invalid ip address `{}`".format(line))
|
||||
logging.error(
|
||||
"invalid ip address `{}` in line `{}`".format(ps[0], line)
|
||||
)
|
||||
return []
|
||||
eip = util.checkip(ps[1])
|
||||
eip = util.check_ip(ps[1])
|
||||
if eip == -1:
|
||||
logging.error("invalid ip address `{}`".format(line))
|
||||
logging.error(
|
||||
"invalid ip address `{}` in line `{}`".format(ps[1], line)
|
||||
)
|
||||
return []
|
||||
if sip > eip:
|
||||
logging.error("start ip({}) should not be greater than end ip({})".format(ps[0], ps[1]))
|
||||
logging.error(
|
||||
"start ip({}) should not be greater than end ip({})".format(
|
||||
ps[0], ps[1]
|
||||
)
|
||||
)
|
||||
return []
|
||||
if len(ps[2]) < 1:
|
||||
logging.error("empty region info in segment line `{}`".format(line))
|
||||
return []
|
||||
segment = seg.Segment(sip=sip, eip=eip, reg=ps[2])
|
||||
|
||||
# check the continuity of data segment
|
||||
segment = seg.Segment(sip=sip, eip=eip, reg=ps[2])
|
||||
# Check the continuity of data segment
|
||||
if last is not None:
|
||||
if last.end_ip + 1 != segment.start_ip:
|
||||
logging.error("discontinuous data segment: last.eip+1({})!=seg.sip({}, {})".format(sip, eip, ps[0]))
|
||||
logging.error(
|
||||
"discontinuous data segment: last.eip+1({})!=seg.sip({}, {})".format(
|
||||
sip, eip, ps[0]
|
||||
)
|
||||
)
|
||||
return []
|
||||
|
||||
self.segments.append(segment)
|
||||
last = segment
|
||||
logging.info("all segments loaded, length: {}, elapsed: {}".format(len(self.segments), time.time() - s_tm))
|
||||
logging.info(
|
||||
"all segments loaded, length: {}, elapsed: {}".format(
|
||||
len(self.segments), time.time() - s_tm
|
||||
)
|
||||
)
|
||||
|
||||
def set_vector_index(self, ip, ptr):
|
||||
"""
|
||||
Init and refresh the vector index based on the IP pre-two bytes.
|
||||
"""
|
||||
row, col = (ip >> 24) & 0xFF, (ip >> 16) & 0xFF
|
||||
vi_block = self.vector_index[row][col]
|
||||
if vi_block.first_ptr == 0:
|
||||
vi_block.first_ptr = ptr
|
||||
vi_block.last_ptr = ptr + idx.SegmentIndexBlockSize
|
||||
vi_block.last_ptr = ptr + idx.Segment_Index_Block_Size
|
||||
else:
|
||||
vi_block.last_ptr = ptr + idx.SegmentIndexBlockSize
|
||||
vi_block.last_ptr = ptr + idx.Segment_Index_Block_Size
|
||||
self.vector_index[row][col] = vi_block
|
||||
|
||||
def start(self):
|
||||
"""Start to make the 'xdb' binary file."""
|
||||
"""
|
||||
Start to make the 'xdb' binary file.
|
||||
"""
|
||||
if len(self.segments) < 1:
|
||||
logging.error("empty segment list")
|
||||
return
|
||||
|
||||
# 1. write all the region/data to the binary file
|
||||
self.dst_handle.seek(HeaderInfoLength+VectorIndexLength, 0)
|
||||
# 1. Write all the region/data to the binary file
|
||||
self.dst_handle.seek(Header_Info_Length + Vector_Index_Length, 0)
|
||||
|
||||
logging.info("try to write the data block ... ")
|
||||
for s in self.segments:
|
||||
logging.info("try to write region '{}'...".format(s.region))
|
||||
if s.region in self.region_pool:
|
||||
logging.info(" --[Cached] with ptr={}".format(self.region_pool[s.region]))
|
||||
logging.info(
|
||||
" --[Cached] with ptr={}".format(self.region_pool[s.region])
|
||||
)
|
||||
continue
|
||||
region = bytes(s.region, encoding="utf-8")
|
||||
if len(region) > 0xFFFF:
|
||||
logging.error("too long region info `{}`: should be less than {} bytes".format(s.region, 0xFFFF))
|
||||
logging.error(
|
||||
"too long region info `{}`: should be less than {} bytes".format(
|
||||
s.region, 0xFFFF
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
# get the first ptr of the next region
|
||||
# Get the first ptr of the next region
|
||||
pos = self.dst_handle.seek(0, 1)
|
||||
logging.info("{} {} {}".format(pos, region, s.region))
|
||||
self.dst_handle.write(region)
|
||||
self.region_pool[s.region] = pos
|
||||
logging.info(" --[Added] with ptr={}".format(pos))
|
||||
# 2. write the index block and cache the super index block
|
||||
# 2. Write the index block and cache the super index block
|
||||
logging.info("try to write the segment index block ... ")
|
||||
counter, start_index_ptr, end_index_ptr = 0, -1, -1
|
||||
for sg in self.segments:
|
||||
data_ptr = -1
|
||||
if sg.region in self.region_pool:
|
||||
data_ptr = self.region_pool[sg.region]
|
||||
else:
|
||||
if sg.region not in self.region_pool:
|
||||
logging.error("missing ptr cache for region `{}`".format(sg.region))
|
||||
return
|
||||
|
||||
data_len = len(bytes(sg.region, encoding="utf-8"))
|
||||
if data_len < 1:
|
||||
logging.error("empty region info for segment '{}'".format(sg.region))
|
||||
return
|
||||
|
||||
seg_list = sg.split()
|
||||
logging.info("try to index segment({} split) {} ...".format(len(seg_list), sg.string()))
|
||||
logging.info(
|
||||
"try to index segment({} split) {} ...".format(len(seg_list), sg)
|
||||
)
|
||||
for s in seg_list:
|
||||
pos = self.dst_handle.seek(0, 1)
|
||||
|
||||
s_index = idx.SegmentIndexBlock(
|
||||
sip=s.start_ip, eip=s.end_ip, dl=data_len, dp=data_ptr
|
||||
sip=s.start_ip,
|
||||
eip=s.end_ip,
|
||||
dl=data_len,
|
||||
dp=self.region_pool[sg.region],
|
||||
)
|
||||
self.dst_handle.write(s_index.encode())
|
||||
logging.info("|-segment index: {}, ptr: {}, segment: {}".format(counter, pos, s.string()))
|
||||
logging.info(
|
||||
"|-segment index: {}, ptr: {}, segment: {}".format(counter, pos, s)
|
||||
)
|
||||
self.set_vector_index(s.start_ip, pos)
|
||||
counter += 1
|
||||
|
||||
# check and record the start index ptr
|
||||
# Check and record the start index ptr
|
||||
if start_index_ptr == -1:
|
||||
start_index_ptr = pos
|
||||
end_index_ptr = pos
|
||||
|
||||
# synchronized the vector index block
|
||||
# 3. Synchronized the vector index block
|
||||
logging.info("try to write the vector index block ... ")
|
||||
self.dst_handle.seek(HeaderInfoLength, 0)
|
||||
self.dst_handle.seek(Header_Info_Length, 0)
|
||||
for i in range(0, len(self.vector_index)):
|
||||
for j in range(0, len(self.vector_index[i])):
|
||||
vi = self.vector_index[i][j]
|
||||
self.dst_handle.write(vi.encode())
|
||||
|
||||
# synchronized the segment index info
|
||||
# 4. Synchronized the segment index info
|
||||
logging.info("try to write the segment index ptr ... ")
|
||||
buff = struct.pack("<II", start_index_ptr, end_index_ptr)
|
||||
self.dst_handle.seek(8, 0)
|
||||
self.dst_handle.write(buff)
|
||||
|
||||
logging.info("write done, dataBlocks: {}, indexBlocks: ({}, {}), indexPtr: ({}, {})".format(
|
||||
len(self.region_pool), len(self.segments), counter, start_index_ptr, end_index_ptr
|
||||
))
|
||||
logging.info(
|
||||
"write done, dataBlocks: {}, indexBlocks: ({}, {}), indexPtr: ({}, {})".format(
|
||||
len(self.region_pool),
|
||||
len(self.segments),
|
||||
counter,
|
||||
start_index_ptr,
|
||||
end_index_ptr,
|
||||
)
|
||||
)
|
||||
|
||||
def end(self):
|
||||
"""End of make the 'xdb' binary file."""
|
||||
"""
|
||||
End of make the 'xdb' binary file.
|
||||
"""
|
||||
try:
|
||||
self.src_handle.close()
|
||||
self.dst_handle.close()
|
||||
@@ -250,20 +290,27 @@ class Maker:
|
||||
|
||||
|
||||
def new_maker(policy: int, srcfile: str, dstfile: str) -> Maker:
|
||||
"""Create a xdb Maker to make the xdb binary file
|
||||
"""
|
||||
Create a xdb Maker to make the xdb binary file
|
||||
:param policy: index algorithm code 1:vector, 2:b-tree
|
||||
:param srcfile: source ip text file path
|
||||
:param dstfile: destination binary xdb file path
|
||||
:return: the 'xdb' Maker
|
||||
"""
|
||||
try:
|
||||
sh = open(srcfile, mode='r', encoding='utf-8')
|
||||
dh = open(dstfile, mode='wb')
|
||||
sh = open(srcfile, mode="r", encoding="utf-8")
|
||||
dh = open(dstfile, mode="wb")
|
||||
return Maker(
|
||||
sh=sh, dh=dh, ip=policy, sg=[], rp={},
|
||||
vi=[[idx.VectorIndexBlock() for _ in range(VectorIndexRows)] for _ in range(VectorIndexCols)],
|
||||
sh=sh,
|
||||
dh=dh,
|
||||
ip=policy,
|
||||
sg=[],
|
||||
rp={},
|
||||
vi=[
|
||||
[idx.VectorIndexBlock() for _ in range(Vector_Index_Rows)]
|
||||
for _ in range(Vector_Index_Cols)
|
||||
],
|
||||
)
|
||||
except IOError as e:
|
||||
logging.error(e)
|
||||
sys.exit()
|
||||
|
||||
|
||||
@@ -1,161 +0,0 @@
|
||||
import socket
|
||||
import struct
|
||||
import io
|
||||
import sys
|
||||
|
||||
HeaderInfoLength = 256
|
||||
VectorIndexRows = 256
|
||||
VectorIndexCols = 256
|
||||
VectorIndexSize = 8
|
||||
SegmentIndexSize = 14
|
||||
|
||||
|
||||
class XdbSearcher(object):
    """Look up the region string for an IPv4 address in an xdb file.

    The vector index entry is read from the first available source, in
    this order: ``vectorIndex`` bytes, ``contentBuff`` (the whole file in
    memory), then the opened file handle. Segment index and region data
    are read from ``contentBuff`` when set, otherwise from the file handle.
    """

    __f = None

    # Bytes of the vector index area only (the minimal memory allocation).
    vectorIndex = None
    # The whole xdb file read into memory.
    contentBuff = None

    @staticmethod
    def loadVectorIndexFromFile(dbfile):
        """Return the vector index bytes of ``dbfile``.

        Prints the error and returns None when the file cannot be read.
        """
        # Each vector index slot is VectorIndexSize bytes (two 4-byte ptrs).
        vi_len = VectorIndexRows * VectorIndexCols * VectorIndexSize
        try:
            with io.open(dbfile, "rb") as f:
                f.seek(HeaderInfoLength)
                return f.read(vi_len)
        except IOError as e:
            print("[Error]: %s" % e)
            return None

    @staticmethod
    def loadContentFromFile(dbfile):
        """Return the full content of ``dbfile`` as bytes.

        Prints the error and returns None when the file cannot be read.
        """
        try:
            with io.open(dbfile, "rb") as f:
                return f.read()
        except IOError as e:
            print("[Error]: %s" % e)
            return None

    def __init__(self, dbfile=None, vectorIndex=None, contentBuff=None):
        self.initDatabase(dbfile, vectorIndex, contentBuff)

    def search(self, ip):
        """Search by dotted ip string, decimal string, or integer ip."""
        if isinstance(ip, str):
            return self.searchByIPStr(ip)
        return self.searchByIPLong(ip)

    def searchByIPStr(self, ip):
        """Search by an ip given as a dotted or an all-digit string."""
        # An all-digit string is an already-encoded ip; it must become an
        # int before the bit shifts in searchByIPLong.
        ip = int(ip) if ip.isdigit() else self.ip2long(ip)
        return self.searchByIPLong(ip)

    def searchByIPLong(self, ip):
        """Return the region string for integer ``ip``, or "" if no match."""
        # Locate the segment index block based on the vector index
        il0 = (ip >> 24) & 0xFF
        il1 = (ip >> 16) & 0xFF
        idx = il0 * VectorIndexCols * VectorIndexSize + il1 * VectorIndexSize

        if self.vectorIndex is not None:
            sPtr = self.getLong(self.vectorIndex, idx)
            ePtr = self.getLong(self.vectorIndex, idx + 4)
        elif self.contentBuff is not None:
            sPtr = self.getLong(self.contentBuff, HeaderInfoLength + idx)
            ePtr = self.getLong(self.contentBuff, HeaderInfoLength + idx + 4)
        else:
            self.__f.seek(HeaderInfoLength + idx)
            buffer_ptr = self.__f.read(8)
            sPtr = self.getLong(buffer_ptr, 0)
            ePtr = self.getLong(buffer_ptr, 4)

        # Binary search the segment index block to get the region info
        dataLen = dataPtr = -1
        low = 0
        high = (ePtr - sPtr) // SegmentIndexSize
        while low <= high:
            mid = (low + high) >> 1
            p = sPtr + mid * SegmentIndexSize
            # Read one segment index entry: sip, eip, data length, data ptr
            buffer_sip = self.readBuffer(p, SegmentIndexSize)
            sip = self.getLong(buffer_sip, 0)
            if ip < sip:
                high = mid - 1
                continue
            eip = self.getLong(buffer_sip, 4)
            if ip > eip:
                low = mid + 1
                continue
            dataLen = self.getInt2(buffer_sip, 8)
            dataPtr = self.getLong(buffer_sip, 10)
            break

        # Empty match interception
        if dataPtr < 0:
            return ""

        return self.readBuffer(dataPtr, dataLen).decode("utf-8")

    def readBuffer(self, offset, length):
        """Read ``length`` bytes at ``offset``.

        Returns None when neither the content buffer nor a file is set.
        """
        # Check the in-memory buffer first
        if self.contentBuff is not None:
            return self.contentBuff[offset:offset + length]

        # Read from the file handle
        if self.__f is not None:
            self.__f.seek(offset)
            return self.__f.read(length)

        return None

    def initDatabase(self, dbfile, vi, cb):
        """Initialize the data source used for searching.

        With a content buffer ``cb`` no file is opened and ``vi`` is
        ignored; otherwise ``dbfile`` is opened and ``vi`` (may be None)
        is kept as the vector index. Exits the process on an I/O error.
        """
        try:
            if cb is not None:
                self.__f = None
                self.vectorIndex = None
                self.contentBuff = cb
            else:
                self.__f = io.open(dbfile, "rb")
                self.vectorIndex = vi
        except IOError as e:
            print("[Error]: %s" % e)
            sys.exit()

    def ip2long(self, ip):
        """Convert a dotted ip string to its integer value."""
        _ip = socket.inet_aton(ip)
        return struct.unpack("!L", _ip)[0]

    def isip(self, ip):
        """Return True when ``ip`` is four dot-separated numbers in 0..255."""
        p = ip.split(".")
        if len(p) != 4:
            return False
        for pp in p:
            if not pp.isdigit() or len(pp) > 3 or int(pp) > 255:
                return False
        return True

    def getLong(self, b, offset):
        """Read a little-endian unsigned 32-bit int at ``offset``.

        Returns 0 when fewer than 4 bytes are available there.
        """
        chunk = b[offset:offset + 4]
        if len(chunk) == 4:
            # Explicit "<": native order would misread on big-endian hosts.
            return struct.unpack("<I", chunk)[0]
        return 0

    def getInt2(self, b, offset):
        """Read a little-endian unsigned 16-bit int at ``offset``."""
        # The high byte must be shifted; masking a single byte with 0xFF00
        # always yields 0 and truncated every length above 255.
        return (b[offset] & 0xFF) | ((b[offset + 1] & 0xFF) << 8)

    def close(self):
        """Close the file handle (if any) and drop the cached buffers."""
        if self.__f is not None:
            self.__f.close()
        self.vectorIndex = None
        self.contentBuff = None
|
||||
@@ -1,6 +1,10 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
import util
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
import xdb.util as util
|
||||
|
||||
|
||||
class Segment:
|
||||
@@ -12,9 +16,30 @@ class Segment:
|
||||
self.start_ip, self.end_ip = sip, eip
|
||||
self.region = reg
|
||||
|
||||
def __str__(self):
|
||||
return "{}|{}|{}".format(
|
||||
util.long2ip(self.start_ip), util.long2ip(self.end_ip), self.region
|
||||
)
|
||||
|
||||
def split(self) -> list:
|
||||
"""Split the segment based on the pre-two bytes."""
|
||||
# 1, split the segment with the first byte
|
||||
"""
|
||||
Split the segment based on the first two bytes of the ip.
|
||||
:return: the list of segments after the split
|
||||
"""
|
||||
# Example:
|
||||
# split the segment "116.31.76.0|117.21.79.49|region"
|
||||
#
|
||||
# Return the list with segments:
|
||||
# 116.31.76.0 | 116.31.255.255 | region
|
||||
# 116.32.0.0 | 116.32.255.255 | region
|
||||
# ... | ... | region
|
||||
# 116.255.0.0 | 116.255.255.255 | region
|
||||
# 117.0.0.0 | 117.0.255.255 | region
|
||||
# 117.1.0.0 | 117.1.255.255 | region
|
||||
# ... | ... | region
|
||||
# 117.21.0.0 | 117.21.79.49 | region
|
||||
|
||||
# 1. Split the segment with the first byte
|
||||
t_list_1 = []
|
||||
s_byte_1, e_byte_1 = (self.start_ip >> 24) & 0xFF, (self.end_ip >> 24) & 0xFF
|
||||
n_sip = self.start_ip
|
||||
@@ -25,11 +50,10 @@ class Segment:
|
||||
n_sip = (i + 1) << 24
|
||||
else:
|
||||
eip = self.end_ip
|
||||
|
||||
# append the new segment (maybe)
|
||||
# Append the new segment (maybe)
|
||||
t_list_1.append(Segment(sip, eip))
|
||||
|
||||
# 2, split the segments with the second byte
|
||||
# 2. Split the segments with the second byte
|
||||
t_list_2 = []
|
||||
for s in t_list_1:
|
||||
base = s.start_ip & 0xFF000000
|
||||
@@ -42,28 +66,5 @@ class Segment:
|
||||
n_sip = 0
|
||||
else:
|
||||
eip = self.end_ip
|
||||
|
||||
t_list_2.append(Segment(sip, eip, self.region))
|
||||
|
||||
return t_list_2
|
||||
|
||||
def string(self) -> str:
|
||||
return util.long2ip(self.start_ip) + "|" + util.long2ip(self.end_ip) + "|" + self.region
|
||||
|
||||
|
||||
def segment_from(seg: str) -> Segment:
    """Parse a ``start_ip|end_ip|region`` line into a Segment.

    :param seg: the segment line to parse
    :return: the parsed Segment, or a default-constructed Segment when the
        line does not have three fields or ``util.checkip`` reports -1 for
        either ip
    """
    segment = Segment()
    # Split on the first two separators only: the region text may itself
    # contain "|", and a larger maxsplit made such lines fail the check.
    ps = seg.split("|", 2)
    if len(ps) != 3:
        return segment

    sip = util.checkip(ps[0])
    if sip == -1:
        return segment
    eip = util.checkip(ps[1])
    if eip == -1:
        return segment

    segment.start_ip, segment.end_ip = sip, eip
    segment.region = ps[2]
    return segment
|
||||
|
||||
@@ -1,42 +1,47 @@
|
||||
# Created by leolin49 on 2022/7/7.
|
||||
# Copyright (C) 2022 leolin49. All rights reserved.
|
||||
|
||||
shift_index = (24, 16, 8, 0)
|
||||
# Util function
|
||||
# Copyright 2022 The Ip2Region Authors. All rights reserved.
|
||||
# Use of this source code is governed by an Apache2.0-style
|
||||
# license that can be found in the LICENSE file.
|
||||
#
|
||||
# Author: leolin49 <leolin49@foxmail.com>
|
||||
#
|
||||
_SHIFT_INDEX = (24, 16, 8, 0)
|
||||
|
||||
|
||||
def checkip(ip: str) -> int:
|
||||
"""Convert ip string to integer."""
|
||||
def check_ip(ip: str) -> int:
|
||||
"""
|
||||
Convert ip string to integer.
|
||||
Return -1 if ip is not the correct ipv4 address.
|
||||
"""
|
||||
if not is_ipv4(ip):
|
||||
return -1
|
||||
ps = ip.split(".")
|
||||
if len(ps) != 4:
|
||||
return 0
|
||||
val = 0
|
||||
for i in range(len(ps)):
|
||||
d = int(ps[i])
|
||||
if d < 0 or d > 255:
|
||||
return 0
|
||||
val |= d << shift_index[i]
|
||||
val |= d << _SHIFT_INDEX[i]
|
||||
return val
|
||||
|
||||
|
||||
def long2ip(num: int) -> str:
|
||||
"""Convert integer to ip string."""
|
||||
return "{}.{}.{}.{}".format((num >> 24) & 0xFF, (num >> 16) & 0xFF, (num >> 8) & 0xFF, num & 0xFF)
|
||||
|
||||
|
||||
def mid_ip(sip: int, eip: int):
|
||||
"""Get the middle ip between sip and eip."""
|
||||
return (sip + eip) >> 1
|
||||
"""
|
||||
Convert integer to ip string.
|
||||
Return empty string if the num greater than UINT32_MAX or less than 0.
|
||||
"""
|
||||
if num < 0 or num > 0xFFFFFFFF:
|
||||
return ""
|
||||
return "{}.{}.{}.{}".format(
|
||||
(num >> 24) & 0xFF, (num >> 16) & 0xFF, (num >> 8) & 0xFF, num & 0xFF
|
||||
)
|
||||
|
||||
|
||||
def is_ipv4(ip: str) -> bool:
|
||||
"""Determine whether it is an ipv4 address."""
|
||||
p = ip.split(".")
|
||||
if len(p) != 4:
|
||||
"""
|
||||
Determine whether it is an ipv4 address.
|
||||
"""
|
||||
ps = ip.split(".")
|
||||
if len(ps) != 4:
|
||||
return False
|
||||
for pp in p:
|
||||
if not pp.isdigit() or len(pp) > 3 or int(pp) > 255:
|
||||
for p in ps:
|
||||
if not p.isdigit() or len(p) > 3 or (int(p) < 0 or int(p) > 255):
|
||||
return False
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user