--- Empirical benchmark for save_to_file_async batching.

-- Name of the mod being loaded; only used to obtain the translator below.
local modname = core.get_current_modname()
-- Translator applied to every player-facing chat string in this file.
local S = core.get_translator(modname)

-- Namespace table for the benchmark; its entry point is autotune.run.
map_octree.autotune = {}

-- Forward declarations. Each local below is assigned further down by a
-- `function name(...)` statement, which lets the helpers reference one
-- another regardless of definition order while staying private to the file.
local parse_budget_ms
local build_inflight_options
local measure_batch_ms
local run_preflight
local compute_test_area
local build_configs
local build_area_offsets
local send_intro_messages
local run_next_config
local run_next_inflight
local handle_inflight_result
local finish_benchmark



---Start the empirical autotune benchmark for the player who issued the command.
---
---Measures a preflight batch synchronously around the player's position,
---derives the test area and the list of batch sizes (k) to try, reports the
---plan in chat and schedules the asynchronous config loop. Only one benchmark
---can be active at a time (tracked in map_octree._autotune_running).
---@param name string Name of the requesting player; also the chat recipient.
---@param param string|nil Raw command parameter; a leading integer is taken as the read budget in ms.
---@return boolean started
---@return string message Translated status or error message.
function map_octree.autotune.run(name, param)
	local player = core.get_player_by_name(name)
	if not player then return false, S("Player not found") end

	if map_octree._autotune_running then
		return false, S("Autotune already running")
	end

	local budget_ms = parse_budget_ms(param)

	-- Get engine's async worker capacity (num_cpus - 2, with autoscaling).
	-- The lookup is guarded: calling a missing API function would raise
	-- before the `or 4` fallback had any chance to apply.
	local get_capacity = core.get_async_threading_capacity
	local engine_capacity = (get_capacity and get_capacity()) or 4
	local size = octchunk.SIZE
	local inflight_options = build_inflight_options(engine_capacity)

	-- The preflight runs synchronously, before the "running" flag is set.
	local ppos = vector.round(player:get_pos())
	local preflight_ms, test_trees_per_axis = run_preflight(ppos, size, budget_ms)
	local half_extent, trees_x, trees_y, trees_z, total_test_trees = compute_test_area(
		ppos, size, test_trees_per_axis
	)

	-- Configs to test: k = batch trees per axis
	-- Higher budget allows testing larger k values (bigger batches)
	local configs = build_configs(preflight_ms, budget_ms, test_trees_per_axis)
	local results = {}

	-- Generate offsets dynamically for each k to avoid cache contamination
	local area_offsets = build_area_offsets(configs)

	map_octree._autotune_running = true

	send_intro_messages(
		name,
		budget_ms,
		total_test_trees,
		trees_x,
		trees_y,
		trees_z,
		test_trees_per_axis,
		preflight_ms,
		inflight_options,
		configs
	)

	-- Shared state threaded through the asynchronous callback chain.
	local ctx = {
		name = name,
		budget_ms = budget_ms,
		size = size,
		ppos = ppos,
		half_extent = half_extent,
		engine_capacity = engine_capacity,
		inflight_options = inflight_options,
		configs = configs,
		results = results,
		area_offsets = area_offsets,
		config_idx = 0,
		budget_exceeded = false,
	}

	-- Defer the first config so this call returns its status message first.
	core.after(0.1, function() run_next_config(ctx) end)
	return true, S("octree_autotune started (empirical benchmark, may take several minutes)")
end



---Resolve the read budget, in milliseconds, for a benchmark run.
---
---A leading unsigned integer in `param` takes precedence. Without one the
---`map_octree_read_budget_ms` setting is consulted, and 50 is used when that
---is missing or not numeric. The result is floored and never lower than 5.
---@param param any Raw command parameter (stringified before parsing).
---@return number budget_ms
function parse_budget_ms(param)
	local text = tostring(param or "")
	local requested = tonumber(text:match("^%s*(%d+)"))

	if requested == nil then
		local configured = core.settings:get("map_octree_read_budget_ms")
		requested = tonumber(configured or "") or 50
	end

	local whole_ms = math.floor(requested)
	return math.max(5, whole_ms)
end



---Build the list of distinct in-flight job counts to benchmark.
---
---Candidates are one third, two thirds and all of the capacity. Each is
---floored, raised to at least min(2, capacity) and de-duplicated in that
---order, so the list is ascending.
---A capacity below 1 (or a non-numeric one) is treated as 1 so that no
---candidate is ever 0: the value is later used both as a worker count and as
---a divisor when estimating the expected run time.
---@param engine_capacity number Async worker capacity reported by the engine.
---@return integer[] inflight_options
function build_inflight_options(engine_capacity)
	local capacity = math.max(1, tonumber(engine_capacity) or 1)
	local inflight_options = {}
	local inflight_seen = {}
	-- Never go below two workers unless the engine itself offers fewer.
	local min_inflight = math.min(2, capacity)

	local function add_inflight_option(value)
		local v = math.max(min_inflight, math.floor(value))
		if not inflight_seen[v] then
			inflight_seen[v] = true
			inflight_options[#inflight_options + 1] = v
		end
	end

	add_inflight_option(capacity / 3)
	add_inflight_option(capacity * 2 / 3)
	add_inflight_option(capacity)
	return inflight_options
end



---Time one synchronous batch: load and read a padded cube of map data, then
---populate k^3 throw-away trees from the data that was read.
---
---The cube spans k * size + 1 nodes per axis around `center`, and the read is
---padded by size / 2 on every side. The clock is stopped before the
---VoxelManip is closed, so the close is not part of the measurement.
---@param k integer Trees per axis in the batch.
---@param center vector Position the batch cube is centred on.
---@param size integer Edge length of one tree, in nodes.
---@return number elapsed_ms
function measure_batch_ms(k, center, size)
	local edge = k * size + 1
	local reach = math.floor(edge / 2)
	local pad = size / 2

	local minp = vector.new(center.x - reach, center.y - reach, center.z - reach)
	local maxp = vector.new(minp.x + edge - 1, minp.y + edge - 1, minp.z + edge - 1)
	local read_min = vector.subtract(minp, pad)
	local read_max = vector.add(maxp, pad)

	local started_us = core.get_us_time()

	-- Read phase: load the padded region and pull node + param2 data.
	core.load_area(read_min, read_max)
	local vm = core.get_voxel_manip()
	local emerged_min, emerged_max = vm:read_from_map(read_min, read_max)
	local area = VoxelArea(emerged_min, emerged_max)
	local node_data = {}
	local param2_data = {}
	vm:get_data(node_data)
	vm:get_param2_data(param2_data)

	-- Build phase: one tree per grid cell; the trees are discarded right away.
	local last = k - 1
	for bx = 0, last do
		local x = minp.x + bx * size
		for by = 0, last do
			local y = minp.y + by * size
			for bz = 0, last do
				local snapped = octchunk.snap_to_center(vector.new(x, y, minp.z + bz * size))
				local scratch_tree = {center = snapped, size = size}
				octchunk.populate_tree_from_area(scratch_tree, area, node_data, param2_data)
			end
		end
	end

	local elapsed_us = core.get_us_time() - started_us
	vm:close()
	return elapsed_us / 1000
end



---Probe batch cost with a k=4 batch and size the test area from it.
---
---The batch is measured twice and the slower run is kept. The largest k
---expected to fit the budget is estimated by scaling the probe k with the
---cube root of budget / measured time. The test area is three times that
---estimate per axis, clamped to the range 6..28.
---@param center vector Position the probe batch is centred on.
---@param size integer Edge length of one tree, in nodes.
---@param budget_ms number Read budget in milliseconds.
---@return number preflight_ms Slower of the two probe timings.
---@return integer test_trees_per_axis
function run_preflight(center, size, budget_ms)
	local probe_k = 4

	local first_ms = measure_batch_ms(probe_k, center, size)
	local second_ms = measure_batch_ms(probe_k, center, size)
	local preflight_ms = math.max(first_ms, second_ms)

	-- Fall back to the probe k itself when no meaningful ratio can be formed.
	local k_estimate = probe_k
	if budget_ms > 0 and preflight_ms > 0 then
		local axis_scale = (budget_ms / preflight_ms) ^ (1 / 3)
		k_estimate = math.floor(probe_k * axis_scale)
	end

	local at_least_six = math.max(6, 3 * k_estimate)
	return preflight_ms, math.min(28, at_least_six)
end



---Derive the benchmark cube around `center` and count the trees it covers.
---
---The cube reaches `half_extent` nodes below the centre and
---`half_extent - 1` above it on each axis. Both corners are snapped with
---octchunk.snap_to_center and the tree count per axis is taken from the
---distance between the snapped corners.
---@param center vector
---@param size integer Edge length of one tree, in nodes.
---@param test_trees_per_axis integer Requested trees per axis.
---@return integer half_extent
---@return integer trees_x
---@return integer trees_y
---@return integer trees_z
---@return integer total_test_trees
function compute_test_area(center, size, test_trees_per_axis)
	local half_extent = math.floor((test_trees_per_axis * size) / 2)
	local upper_reach = half_extent - 1

	local low_corner = vector.new(
		center.x - half_extent,
		center.y - half_extent,
		center.z - half_extent
	)
	local high_corner = vector.new(
		center.x + upper_reach,
		center.y + upper_reach,
		center.z + upper_reach
	)

	local low_snap = octchunk.snap_to_center(low_corner)
	local high_snap = octchunk.snap_to_center(high_corner)

	-- Number of tree centres between the snapped corners, both ends included.
	local function trees_along(axis)
		return math.floor((high_snap[axis] - low_snap[axis]) / size) + 1
	end

	local trees_x = trees_along("x")
	local trees_y = trees_along("y")
	local trees_z = trees_along("z")

	return half_extent, trees_x, trees_y, trees_z, trees_x * trees_y * trees_z
end



---Choose the batch sizes (k = trees per axis) to benchmark.
---
---The small set {2, 3, 4} is used when the preflight batch already exceeded
---the budget or the test area is at most 12 trees per axis. Otherwise the
---full ladder from 4 to 28 is returned.
---@param preflight_ms number Measured duration of the preflight batch.
---@param budget_ms number Read budget in milliseconds.
---@param test_trees_per_axis integer Trees per axis of the test area.
---@return integer[] configs
function build_configs(preflight_ms, budget_ms, test_trees_per_axis)
	if preflight_ms > budget_ms or test_trees_per_axis <= 12 then
		return {2, 3, 4}
	end
	return {4, 5, 6, 7, 8, 10, 12, 16, 20, 28}
end



---Assign every config its own horizontal offset from the player position.
---
---Offsets are laid out on a 2048-node grid in the X/Z plane (Y is always 0)
---and are handed out in list order. Once the twelve predefined cells are
---used up, further configs are placed along +X at index * 2048.
---@param configs integer[] Batch sizes, in the order they will be tested.
---@return table area_offsets Map from k to an {x, y, z} offset table.
function build_area_offsets(configs)
	local step = 2048
	-- Grid cells as {x, z} multiples of `step`.
	local cells = {
		{0, 0},
		{1, 0}, {-1, 0}, {0, 1}, {0, -1},
		{1, 1}, {-1, -1},
		{2, 0}, {-2, 0}, {0, 2}, {0, -2},
		{2, 2},
	}

	local area_offsets = {}
	for slot, k in ipairs(configs) do
		local cell = cells[slot]
		if cell then
			area_offsets[k] = {x = cell[1] * step, y = 0, z = cell[2] * step}
		else
			area_offsets[k] = {x = slot * step, y = 0, z = 0}
		end
	end
	return area_offsets
end



---Announce the benchmark plan to the player in chat.
---
---Sends the budget and test-area summary, an extra line when the area is
---smaller than 28 trees per axis, an explanation of k, and the in-flight
---options and configs that will be tested.
---@param name string Chat recipient.
---@param budget_ms number
---@param total_test_trees integer
---@param trees_x integer
---@param trees_y integer
---@param trees_z integer
---@param test_trees_per_axis integer
---@param preflight_ms number
---@param inflight_options integer[]
---@param configs integer[]
function send_intro_messages(
	name,
	budget_ms,
	total_test_trees,
	trees_x,
	trees_y,
	trees_z,
	test_trees_per_axis,
	preflight_ms,
	inflight_options,
	configs
)
	local function tell(message)
		core.chat_send_player(name, message)
	end

	tell(S(
		"[octree_autotune] Empirical benchmark: budget=@1ms, test_area=@2 trees (@3x@4x@5)",
		budget_ms, total_test_trees, trees_x, trees_y, trees_z
	))

	-- Only mention scaling when the area is below the 28-per-axis maximum.
	if test_trees_per_axis < 28 then
		local preflight_text = string.format("%.0f", preflight_ms)
		tell(S(
			"[octree_autotune] Area scaled to @1x@2x@3 (preflight k=4 batch=@4ms)",
			test_trees_per_axis,
			test_trees_per_axis,
			test_trees_per_axis,
			preflight_text
		))
	end

	tell(S("[octree_autotune] k = trees per axis in the batch (batch = k^3 trees)"))

	local inflight_list = table.concat(inflight_options, ", ")
	tell(S("[octree_autotune] Inflight options: @1", inflight_list))

	local config_list = table.concat(configs, ", ")
	tell(S("[octree_autotune] Testing configs: k=@1 (separate areas to avoid cache)", config_list))
end



---Advance to the next batch size (k) and start its in-flight sweep.
---
---Finishes the benchmark when the budget was already exceeded or no configs
---remain. Otherwise the batch for this k is timed three times at the
---config's own offset area. If the slowest run exceeds the budget, the
---remaining configs are skipped and the benchmark is finished. Otherwise the
---per-config state is stored in `ctx.current` and the first in-flight option
---is started.
---@param ctx table Shared benchmark state created by autotune.run.
function run_next_config(ctx)
	-- Move the cursor only while the budget still holds.
	if not ctx.budget_exceeded then
		ctx.config_idx = ctx.config_idx + 1
	end
	if ctx.budget_exceeded or ctx.config_idx > #ctx.configs then
		finish_benchmark(ctx)
		return
	end

	local k = ctx.configs[ctx.config_idx]
	local edge = k * ctx.size + 1

	-- Use different area for each k to avoid cache contamination
	local shift = ctx.area_offsets[k] or {x = 0, y = 0, z = 0}
	local origin = ctx.ppos
	local reach = ctx.half_extent
	local test_center = vector.new(origin.x + shift.x, origin.y + shift.y, origin.z + shift.z)
	local corner_min = vector.new(
		test_center.x - reach,
		test_center.y - reach,
		test_center.z - reach
	)
	local corner_max = vector.new(
		test_center.x + reach - 1,
		test_center.y + reach - 1,
		test_center.z + reach - 1
	)

	-- Three timed runs; the slowest one is compared against the budget.
	local slowest_ms
	for _ = 1, 3 do
		local sample_ms = measure_batch_ms(k, test_center, ctx.size)
		if slowest_ms == nil or sample_ms > slowest_ms then
			slowest_ms = sample_ms
		end
	end

	-- Skip if batch exceeds budget
	if slowest_ms > ctx.budget_ms then
		core.chat_send_player(ctx.name, S(
			"  k=@1: SKIP (batch=@2ms > budget=@3ms)",
			k, string.format("%.0f", slowest_ms), ctx.budget_ms
		))
		ctx.budget_exceeded = true
		core.chat_send_player(ctx.name, S("[octree_autotune] Budget exceeded. Skipping remaining configs."))
		core.after(0.1, function() finish_benchmark(ctx) end)
		return
	end

	ctx.current = {
		k = k,
		volume = edge * edge * edge,
		k_test_pos1 = corner_min,
		k_test_pos2 = corner_max,
		batch_p95_ms = slowest_ms,
	}
	ctx.inflight_idx = 0

	run_next_inflight(ctx)
end



---Start the async save for the next in-flight option of the current config.
---
---When every in-flight option has been tried, the next config is scheduled
---instead. The save covers the current config's test area, and its completion
---callback passes the start timestamp on to handle_inflight_result.
---@param ctx table Shared benchmark state; `ctx.current` must be set.
function run_next_inflight(ctx)
	local next_idx = ctx.inflight_idx + 1
	ctx.inflight_idx = next_idx

	if next_idx > #ctx.inflight_options then
		core.after(0.2, function() run_next_config(ctx) end)
		return
	end

	local inflight = ctx.inflight_options[next_idx]
	local current = ctx.current
	local batch_text = string.format("%.0f", current.batch_p95_ms)
	core.chat_send_player(ctx.name, S(
		"  k=@1: batch=@2ms, inflight=@3, testing...",
		current.k, batch_text, inflight
	))

	-- The clock starts right before the request is issued.
	local start_time = core.get_us_time()

	local save_options = {
		file_name = "_autotune_test",
		subdir = "bench",
		async_inflight = inflight,
		read_budget_ms = ctx.budget_ms,
		max_voxelmanip_volume = current.volume,
		flush_async_workers = false,
	}
	local function on_saved(ok, result, stats)
		handle_inflight_result(ctx, inflight, start_time, ok, result, stats)
	end

	map_octree.save_to_file_async(current.k_test_pos1, current.k_test_pos2, save_options, on_saved)
end



---Record the outcome of one async save run and schedule the next one.
---
---On failure the error is reported and the next in-flight option is
---scheduled. On success the raw throughput is computed from the number of
---trees saved and the elapsed time. It is then discounted by how far the run
---lagged behind the ideal duration of (batches / inflight) * budget, and a
---result row is appended to `ctx.results`.
---@param ctx table Shared benchmark state.
---@param inflight integer In-flight option that was used for this run.
---@param start_time integer Timestamp in microseconds taken before the save.
---@param ok boolean Whether the save succeeded.
---@param result table On success, holds `trees.size` with x/y/z counts; on failure, the error value.
---@param stats table|nil Optional statistics; `batch_count` is used when positive.
function handle_inflight_result(ctx, inflight, start_time, ok, result, stats)
	local elapsed_ms = (core.get_us_time() - start_time) / 1000
	local current = ctx.current

	if not ok then
		core.chat_send_player(ctx.name, S(
			"  k=@1: FAILED - @2",
			current.k, tostring(result)
		))
		core.after(0.1, function() run_next_inflight(ctx) end)
		return
	end

	local dims = result.trees.size
	local chunk_count = dims.x * dims.y * dims.z
	local throughput = chunk_count / elapsed_ms * 1000

	-- Prefer the reported batch count; otherwise derive it from k^3 per batch.
	local batch_count = (stats and stats.batch_count) or 0
	if batch_count <= 0 then
		local trees_per_batch = current.k * current.k * current.k
		batch_count = math.max(1, math.ceil(chunk_count / trees_per_batch))
	end

	local expected_ms = (batch_count / inflight) * ctx.budget_ms
	local lag_ratio = elapsed_ms / math.max(1, expected_ms)
	-- Runs faster than expected are not rewarded: the divisor is at least 1.
	local eff_throughput = throughput / math.max(1, lag_ratio)
	local lag_pct = math.max(0, (lag_ratio - 1) * 100)

	local row = {
		k = current.k,
		volume = current.volume,
		inflight = inflight,
		batch_ms = current.batch_p95_ms,
		elapsed_ms = elapsed_ms,
		chunk_count = chunk_count,
		batch_count = batch_count,
		expected_ms = expected_ms,
		throughput = throughput,
		eff_throughput = eff_throughput,
		lag_ratio = lag_ratio,
	}
	ctx.results[#ctx.results + 1] = row

	core.chat_send_player(ctx.name, S(
		"  k=@1: @2 chunks in @3s → @4 chunks/s (eff=@5, inflight=@6, lag_vs_budget=@7%)",
		current.k,
		chunk_count,
		string.format("%.1f", elapsed_ms / 1000),
		string.format("%.0f", throughput),
		string.format("%.0f", eff_throughput),
		inflight,
		string.format("%.0f", lag_pct)
	))

	collectgarbage("collect")
	core.after(0.2, function() run_next_inflight(ctx) end)
end



---Wrap up the benchmark: clean up, report the results and apply the best one.
---
---Clears the running flag and removes the temporary benchmark file. If no
---result was recorded, only a notice is sent. Otherwise the result with the
---highest effective throughput is chosen, all results are listed in chat, and
---the winning volume, budget and in-flight override are stored in the
---settings and written out.
---@param ctx table Shared benchmark state created by autotune.run.
function finish_benchmark(ctx)
	map_octree._autotune_running = false

	local function tell(message)
		core.chat_send_player(ctx.name, message)
	end
	local function whole(value)
		return string.format("%.0f", value)
	end

	-- Delete test file
	local bench_dir
	if map_octree.get_storage_dir then
		bench_dir = map_octree.get_storage_dir("bench")
	else
		bench_dir = core.get_worldpath() .. "/map_octree/bench"
	end
	os.remove(bench_dir .. "/_autotune_test.bin")

	local results = ctx.results
	if #results == 0 then
		tell(S("[octree_autotune] No valid configurations found!"))
		return
	end

	-- The first result wins ties because only strictly better rows replace it.
	local best = results[1]
	for i = 2, #results do
		local candidate = results[i]
		if candidate.eff_throughput > best.eff_throughput then
			best = candidate
		end
	end

	-- Reference workload used for the projected duration line.
	local ref_trees = 67000
	local est_time_s = ref_trees / best.eff_throughput

	tell("")
	tell(S("[octree_autotune] === RESULTS ==="))
	for _, row in ipairs(results) do
		local marker = ""
		if row == best then
			marker = " <<<"
		end
		tell(S(
			"  k=@1: @2 trees/s (eff=@3, batch=@4ms, inflight=@5, lag_vs_budget=@6%)@7",
			row.k,
			whole(row.throughput),
			whole(row.eff_throughput),
			whole(row.batch_ms),
			row.inflight,
			whole(math.max(0, (row.lag_ratio - 1) * 100)),
			marker
		))
	end

	tell(S(
		"[octree_autotune] BEST: k=@1, volume=@2, inflight=@3 → @4 trees/s (eff=@5)",
		best.k, best.volume, best.inflight, whole(best.throughput), whole(best.eff_throughput)
	))
	tell(S(
		"[octree_autotune] Projected: @1k trees in ~@2s",
		math.floor(ref_trees / 1000), whole(est_time_s)
	))

	-- The note is shown only when no recorded run used fewer workers than the cap.
	local capacity = ctx.engine_capacity
	local any_below_cap = false
	for _, row in ipairs(results) do
		if row.inflight < capacity then
			any_below_cap = true
			break
		end
	end
	if not any_below_cap then
		tell(S(
			"[octree_autotune] NOTE: CPU-bound (@1 async workers = num_cpus-2). This is your hardware limit.",
			capacity
		))
	end

	local settings = core.settings
	settings:set("map_octree_max_voxelmanip_volume", tostring(best.volume))
	settings:set("map_octree_read_budget_ms", tostring(ctx.budget_ms))

	if best.inflight < capacity then
		-- Fewer workers than the engine offers performed best: pin that count.
		settings:set("map_octree_async_inflight_override", tostring(best.inflight))
		tell(S(
			"[octree_autotune] Applied: volume=@1, budget=@2ms, inflight=@3 (override: fewer workers faster)",
			best.volume, ctx.budget_ms, best.inflight
		))
	else
		-- The override is stored as "0" when the best run used the full capacity.
		settings:set("map_octree_async_inflight_override", "0")
		tell(S(
			"[octree_autotune] Applied: volume=@1, budget=@2ms, inflight=@3 (from CPU count)",
			best.volume, ctx.budget_ms, best.inflight
		))
	end

	settings:write()
end
