--- Empirical benchmark for save_to_file_async batching.

-- Translator bound to this mod's text domain; S(...) wraps the player-facing messages below.
local modname = core.get_current_modname()
local S = core.get_translator(modname)

-- Namespace table that holds the autotune API (run is attached below).
-- `map_octree` is expected to exist already; it is not created in the visible part of this file.
map_octree.autotune = {}



---Chat-command style entry point: benchmark save_to_file_async batch sizes and persist the fastest.
---
---For every candidate k (batch edge length in trees) the run:
---  1. times VoxelManip reads of a (k*size+1)^3 node cube and keeps the slowest sample,
---  2. skips k when that read time exceeds the budget,
---  3. derives the in-flight job count from budget / read time, clamped to [1, async capacity],
---  4. saves a test area spanning test_trees_per_axis * size nodes per axis through
---     map_octree.save_to_file_async and records the resulting trees per second.
---Each k works in its own horizontally shifted area so earlier configs do not pre-load its map data.
---When all candidates are done the scratch file is removed, the results are reported via chat and
---the winning volume, the budget and the in-flight override are written to core.settings.
---
---The benchmark itself is driven by core.after and the save callback, so this function returns
---right after scheduling the first config.
---
---@param name string Name of the player who started the run; receives all progress messages.
---@param param string|nil Optional argument text; leading digits are taken as the read budget in ms.
---@return boolean ok false when the run could not be started.
---@return string message Translated status or error text.
function map_octree.autotune.run(name, param)
	local player = core.get_player_by_name(name)
	if not player then return false, S("Player not found") end

	if map_octree._autotune_running then
		return false, S("Autotune already running")
	end

	-- The tree-count math below calls octchunk.snap_to_center unconditionally, so a missing
	-- octchunk API is rejected here instead of raising a nil-index error inside the handler.
	if not octchunk or not octchunk.snap_to_center then
		return false, S("octchunk API not available")
	end

	param = tostring(param or "")

	-- Budget priority: leading digits of param, then the stored setting, then 50 ms.
	local budget_ms = tonumber(param:match("^%s*(%d+)"))
	if budget_ms == nil then
		budget_ms = tonumber(core.settings:get("map_octree_read_budget_ms") or "") or 50
	end
	-- Whole milliseconds, never below 5.
	budget_ms = math.max(5, math.floor(budget_ms))

	-- Get engine's async worker capacity (num_cpus - 2, with autoscaling).
	-- Falls back to 4 when the query is not exposed by the engine or returns nil.
	local get_capacity = core.get_async_threading_capacity
	local engine_capacity = (get_capacity and get_capacity()) or 4
	-- Cap at engine capacity - no point queueing more jobs than workers
	local autotune_inflight_cap = math.max(1, engine_capacity)
	local size = octchunk.SIZE or 16

	-- Test area: a cube of test_trees_per_axis * size nodes per axis centred on the player.
	local test_trees_per_axis = 20
	local ppos = vector.round(player:get_pos())
	local half_extent = math.floor((test_trees_per_axis * size) / 2)
	local test_pos1 = vector.new(ppos.x - half_extent, ppos.y - half_extent, ppos.z - half_extent)
	local test_pos2 = vector.new(ppos.x + half_extent - 1, ppos.y + half_extent - 1, ppos.z + half_extent - 1)

	-- Calculate actual tree count for the test area (only used for the intro message)
	local snapped1 = octchunk.snap_to_center(test_pos1)
	local snapped2 = octchunk.snap_to_center(test_pos2)
	local trees_x = math.floor((snapped2.x - snapped1.x) / size) + 1
	local trees_y = math.floor((snapped2.y - snapped1.y) / size) + 1
	local trees_z = math.floor((snapped2.z - snapped1.z) / size) + 1
	local total_test_trees = trees_x * trees_y * trees_z

	-- Configs to test: k = batch trees per axis
	-- Higher budget allows testing larger k values (bigger batches)
	local configs = {2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 20}
	local results = {}

	-- Give every k its own area offset to avoid cache contamination between configs.
	-- Configs beyond the pattern list fall back to a step of 2048 nodes along +x.
	local area_offsets = {}
	local offset_patterns = {
		{x = 0,     y = 0, z = 0},
		{x = 2048,  y = 0, z = 0},
		{x = -2048, y = 0, z = 0},
		{x = 0,     y = 0, z = 2048},
		{x = 0,     y = 0, z = -2048},
		{x = 2048,  y = 0, z = 2048},
		{x = -2048, y = 0, z = -2048},
		{x = 4096,  y = 0, z = 0},
		{x = -4096, y = 0, z = 0},
		{x = 0,     y = 0, z = 4096},
		{x = 0,     y = 0, z = -4096},
		{x = 4096,  y = 0, z = 4096},
	}
	for i, k in ipairs(configs) do
		area_offsets[k] = offset_patterns[i] or {x = i * 2048, y = 0, z = 0}
	end

	-- From here on the run is committed; the flag is cleared again in finish_benchmark.
	map_octree._autotune_running = true

	core.chat_send_player(name, S(
		"[octree_autotune] Empirical benchmark: budget=@1ms, test_area=@2 trees (@3x@4x@5)",
		budget_ms, total_test_trees, trees_x, trees_y, trees_z
	))
	core.chat_send_player(name, S("[octree_autotune] k = trees per axis in the batch (batch = k^3 trees)"))
	core.chat_send_player(name, S("[octree_autotune] Testing configs: k=@1 (separate areas to avoid cache)", table.concat(configs, ", ")))

	local config_idx = 0
	local finish_benchmark -- forward declaration

	---Run the next config in the autotune sequence, or finish when none are left.
	---Re-schedules itself through core.after once the current config is skipped, fails or completes.
	local function test_next_config()
		config_idx = config_idx + 1
		if config_idx > #configs then
			finish_benchmark()
			return
		end

		local k = configs[config_idx]
		-- Edge length in nodes of one batch read, and the resulting VoxelManip volume.
		local nodes = k * size + 1
		local volume = nodes * nodes * nodes

		-- Use different area for each k to avoid cache contamination
		local offset = area_offsets[k] or {x = 0, y = 0, z = 0}
		local test_center = vector.new(ppos.x + offset.x, ppos.y + offset.y, ppos.z + offset.z)
		local k_test_pos1 = vector.new(test_center.x - half_extent, test_center.y - half_extent, test_center.z - half_extent)
		local k_test_pos2 = vector.new(test_center.x + half_extent - 1, test_center.y + half_extent - 1, test_center.z + half_extent - 1)

		-- The in-flight count depends on the read time, so that is measured first.
		local read_samples = {}

		---Measure a single VoxelManip read duration in microseconds.
		---The timed span covers core.load_area and read_from_map of a nodes^3 cube around
		---test_center; closing the manipulator happens after the clock is stopped.
		---@return number
		local function measure_read()
			local half = math.floor(nodes / 2)
			local minp = vector.new(test_center.x - half, test_center.y - half, test_center.z - half)
			local maxp = vector.new(minp.x + nodes - 1, minp.y + nodes - 1, minp.z + nodes - 1)
			local t0 = core.get_us_time()
			core.load_area(minp, maxp)
			local manip = core.get_voxel_manip()
			manip:read_from_map(minp, maxp)
			local dt = core.get_us_time() - t0
			manip:close()
			return dt
		end

		-- Quick read measurement (7 samples, skip first as warmup)
		measure_read() -- warmup, discard
		for _ = 1, 7 do
			read_samples[#read_samples + 1] = measure_read()
		end
		table.sort(read_samples)
		-- With only 7 samples the largest one stands in for the 95th percentile.
		local read_p95_ms = read_samples[#read_samples] / 1000

		-- Skip if read exceeds budget
		if read_p95_ms > budget_ms then
			core.chat_send_player(name, S(
				"  k=@1: SKIP (read=@2ms > budget=@3ms)",
				k, string.format("%.0f", read_p95_ms), budget_ms
			))
			core.after(0.1, test_next_config)
			return
		end

		-- Calculate inflight based on budget/read_time, capped by autotune limit.
		-- The 0.1 ms floor keeps the division bounded for near-instant reads.
		local inflight = math.floor(budget_ms / math.max(0.1, read_p95_ms))
		inflight = math.max(1, math.min(autotune_inflight_cap, inflight))

		core.chat_send_player(name, S(
			"  k=@1: read=@2ms, inflight=@3, testing...",
			k, string.format("%.0f", read_p95_ms), inflight
		))

		-- Run actual save_to_file_async and measure real throughput
		local start_time = core.get_us_time()
		map_octree.save_to_file_async(k_test_pos1, k_test_pos2, {
			file_name = "_autotune_test",
			subdir = "bench",
			async_inflight = inflight,
			read_budget_ms = budget_ms,
			max_voxelmanip_volume = volume,
			flush_async_workers = false,
		}, function(ok, result, _stats)
			local elapsed_ms = (core.get_us_time() - start_time) / 1000
			if not ok then
				-- On failure `result` carries the error value; report it and move on.
				core.chat_send_player(name, S(
					"  k=@1: FAILED - @2",
					k, tostring(result)
				))
				core.after(0.1, test_next_config)
				return
			end

			-- Saved tree count comes from the grid size reported in the result.
			local chunk_count = result.trees.size.x * result.trees.size.y * result.trees.size.z
			-- Trees per second.
			local throughput = chunk_count / elapsed_ms * 1000

			results[#results + 1] = {
				k = k,
				volume = volume,
				inflight = inflight,
				read_ms = read_p95_ms,
				elapsed_ms = elapsed_ms,
				chunk_count = chunk_count,
				throughput = throughput,
			}

			core.chat_send_player(name, S(
				"  k=@1: @2 chunks in @3s → @4 chunks/s (inflight=@5)",
				k, chunk_count, string.format("%.1f", elapsed_ms / 1000), string.format("%.0f", throughput), inflight
			))

			-- Cleanup and continue
			collectgarbage("collect")
			core.after(0.2, test_next_config)
		end)
	end

	---Report the collected results, persist the winning settings and release the run flag.
	finish_benchmark = function()
		map_octree._autotune_running = false

		-- Delete test file
		local bench_dir
		if map_octree.get_storage_dir then
			bench_dir = map_octree.get_storage_dir("bench")
		else
			bench_dir = core.get_worldpath() .. "/map_octree/bench"
		end
		os.remove(bench_dir .. "/_autotune_test.bin")

		-- Every config was skipped or failed: nothing to rank, settings stay untouched.
		if #results == 0 then
			core.chat_send_player(name, S("[octree_autotune] No valid configurations found!"))
			return
		end

		-- Find best throughput (the earliest config wins ties)
		local best = nil
		for _, r in ipairs(results) do
			if not best or r.throughput > best.throughput then
				best = r
			end
		end

		-- Reference projection: time to save ref_trees trees at the best measured rate
		local ref_trees = 67000
		local est_time_s = ref_trees / best.throughput

		core.chat_send_player(name, "")
		core.chat_send_player(name, S("[octree_autotune] === RESULTS ==="))
		for _, r in ipairs(results) do
			local marker = (r == best) and " <<<" or ""
			core.chat_send_player(name, S(
				"  k=@1: @2 trees/s (read=@3ms, inflight=@4)@5",
				r.k, string.format("%.0f", r.throughput), string.format("%.0f", r.read_ms), r.inflight, marker
			))
		end

		core.chat_send_player(name, S(
			"[octree_autotune] BEST: k=@1, volume=@2, inflight=@3 → @4 trees/s",
			best.k, best.volume, best.inflight, string.format("%.0f", best.throughput)
		))
		core.chat_send_player(name, S(
			"[octree_autotune] Projected: @1k trees in ~@2s",
			math.floor(ref_trees / 1000), string.format("%.0f", est_time_s)
		))

		-- Check if all configs hit the autotune inflight cap
		local all_at_cap = true
		for _, r in ipairs(results) do
			if r.inflight < autotune_inflight_cap then
				all_at_cap = false
				break
			end
		end
		if all_at_cap then
			core.chat_send_player(name, S(
				"[octree_autotune] NOTE: CPU-bound (@1 async workers = num_cpus-2). This is your hardware limit.",
				autotune_inflight_cap
			))
		end

		core.settings:set("map_octree_max_voxelmanip_volume", tostring(best.volume))
		core.settings:set("map_octree_read_budget_ms", tostring(budget_ms))

		-- Save inflight override if best result differs from hardware max
		if best.inflight < autotune_inflight_cap then
			core.settings:set("map_octree_async_inflight_override", tostring(best.inflight))
			core.chat_send_player(name, S(
				"[octree_autotune] Applied: volume=@1, budget=@2ms, inflight=@3 (override: fewer workers faster)",
				best.volume, budget_ms, best.inflight
			))
		else
			-- Clear override - use engine capacity
			core.settings:set("map_octree_async_inflight_override", "0")
			core.chat_send_player(name, S(
				"[octree_autotune] Applied: volume=@1, budget=@2ms, inflight=@3 (from CPU count)",
				best.volume, budget_ms, best.inflight
			))
		end
	end

	-- Start on a later server step so the command handler itself returns promptly.
	core.after(0.1, test_next_config)
	return true, S("octree_autotune started (empirical benchmark, ~30-60s)")
end
