From 77cd029519b85e42b8e4edbfb6b315ffad358750 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 15 May 2021 08:55:41 +0000 Subject: [PATCH] added tsv to bin file convertor --- include/distance.h | 4 +- tests/utils/CMakeLists.txt | 7 +++ tests/utils/compute_groundtruth.cpp | 27 +++-------- tests/utils/tsv_to_bin.cpp | 72 +++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 24 deletions(-) create mode 100644 tests/utils/tsv_to_bin.cpp diff --git a/include/distance.h b/include/distance.h index a4f311e51..3d403d1e5 100644 --- a/include/distance.h +++ b/include/distance.h @@ -226,9 +226,7 @@ namespace diskann { } }; - // Gopal. Slow implementations of the distance functions to get diskann to - // work in v14 machines that do not have AVX2 support. Performance here is not - // a concern, so we are using the simplest possible implementation. + // Slow implementations of the distance functions for machines without AVX2 template class SlowDistanceL2Int : public Distance { virtual float compare(const T *a, const T *b, unsigned length) const { diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt index a343e377b..6f7cb32b8 100644 --- a/tests/utils/CMakeLists.txt +++ b/tests/utils/CMakeLists.txt @@ -17,6 +17,13 @@ if(MSVC) target_link_libraries(ivecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) endif() +add_executable(tsv_to_bin tsv_to_bin.cpp) +if(MSVC) + target_link_options(tsv_to_bin PRIVATE /MACHINE:x64) + target_link_libraries(tsv_to_bin debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib) + target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) +endif() + add_executable(int8_to_float int8_to_float.cpp) if(MSVC) target_link_options(int8_to_float PRIVATE /MACHINE:x64) diff --git a/tests/utils/compute_groundtruth.cpp b/tests/utils/compute_groundtruth.cpp index ed6f08b0b..8fef8c929 100644 --- a/tests/utils/compute_groundtruth.cpp +++ b/tests/utils/compute_groundtruth.cpp @@ -116,9 +116,7 @@ void exact_knn(const size_t dim, const size_t k, const float *const queries) // queries in Col major { float *points_l2sq = new float[npoints]; - // std::cout<<"jere"<(nptsuint64_t * ndimsuint64_t, ALIGNMENT); #pragma omp parallel for schedule(dynamic, 32768) for (int64_t i = 0; i < (int64_t) nptsuint64_t; i++) { @@ -278,11 +266,7 @@ inline void save_groundtruth_as_one_file(const std::string filename, template int aux_main(int argv, char **argc) { - if (argv != 6) { - command_line_help(); - return -1; - } - + size_t npoints, nqueries, dim; std::string base_file(argc[2]); std::string query_file(argc[3]); @@ -331,10 +315,6 @@ int aux_main(int argv, char **argc) { } } - // save_bin(gt_file + std::string("_ids.bin"), closest_points, nqueries, - // k); - // save_bin(gt_file + std::string("_dist.bin"), dist_closest_points, - // nqueries, k); save_groundtruth_as_one_file(gt_file, closest_points, dist_closest_points, nqueries, k); diskann::aligned_free(query_data); @@ -344,6 +324,11 @@ int aux_main(int argv, char **argc) { } int main(int argc, char **argv) { + if (argc != 6) { + command_line_help(); + return -1; + } + if (std::string(argv[1]) == std::string("float")) aux_main(argc, argv); if (std::string(argv[1]) == std::string("int8")) diff --git a/tests/utils/tsv_to_bin.cpp b/tests/utils/tsv_to_bin.cpp new file mode 100644 index 000000000..111a6bb55 --- /dev/null +++ b/tests/utils/tsv_to_bin.cpp @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#include +#include "utils.h" + +template +void block_convert(std::ifstream& reader, std::ofstream& writer, _u64 npts, + _u64 ndims) { + auto read_buf = new T[4 * npts * (ndims + 1)]; + + auto cursor = read_buf; + T val; + + for (_u64 i = 0; i < npts; i++) { + for (_u64 d = 0; d < ndims; ++d) { + reader >> val; + *cursor = val; + cursor++; + } + } + writer.write((char*) read_buf, npts * ndims * sizeof(T)); + delete[] read_buf; +} + +int main(int argc, char** argv) { + if (argc != 6) { + std::cout << argv[0] + << " input_filename.tsv output_filename.bin dim num_pts>" + << std::endl; + exit(-1); + } + + if (std::string(argv[1]) != std::string("float") && + std::string(argv[1]) != std::string("int8") && + std::string(argv[1]) != std::string("uint8")) { + std::cout << "Unsupported type. float, int8 and uint8 types are supported." + << std::endl; + } + + _u64 ndims = atoi(argv[4]); + _u64 npts = atoi(argv[5]); + + std::ifstream reader(argv[2], std::ios::binary | std::ios::ate); + // _u64 fsize = reader.tellg(); + reader.seekg(0, std::ios::beg); + reader.seekg(0, std::ios::beg); + + _u64 blk_size = 131072; + _u64 nblks = ROUND_UP(npts, blk_size) / blk_size; + std::cout << "# blks: " << nblks << std::endl; + std::ofstream writer(argv[3], std::ios::binary); + auto npts_s32 = (_u32) npts; + auto ndims_s32 = (_u32) ndims; + writer.write((char*) &npts_s32, sizeof(_u32)); + writer.write((char*) &ndims_s32, sizeof(_u32)); + + for (_u64 i = 0; i < nblks; i++) { + _u64 cblk_size = std::min(npts - i * blk_size, blk_size); + if (std::string(argv[1]) == std::string("float")) { + block_convert(reader, writer, cblk_size, ndims); + } else if (std::string(argv[1]) == std::string("int8")) { + block_convert(reader, writer, cblk_size, ndims); + } else if (std::string(argv[1]) == std::string("uint8")) { + block_convert(reader, writer, cblk_size, ndims); + } + std::cout << "Block #" << i << " written" << std::endl; + } + + reader.close(); + writer.close(); +}