#!/usr/bin/env python3
# Copyright (c) 2026  1F616EMO
# SPDX-License-Identifier: LGPL-3.0-only
"""Fetch GitHub emoji API and produce a JSON mapping:
  unicode-id -> shortest name

Example output:
  {
	"1f9df-2642": "zombie_man",
	"1f4a4": "zzz"
  }
"""

import json
import os
import re
import sys
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError


# URL of the emoji listing to fetch; the GITHUB_EMOJI_API environment
# variable overrides the default endpoint.
GITHUB_EMOJI_API = os.environ.get(
    "GITHUB_EMOJI_API", "https://api.github.com/emojis")

# Regex to capture the unicode id portion (e.g. 1f9df-2642) of an emoji image
# URL. Every literal dot, including those in the host name, is written as
# [.] so that it cannot match an arbitrary character.
RE = re.compile(
    r"^https?://github[.]githubassets[.]com/images/icons/emoji/unicode/"
    r"([0-9a-f-]+)[.]png[?]v[0-9]+$")


def process_uid(uid: str) -> str:
    """Convert a dash-separated unicode id into the key used in the output.

    Rules, checked in order:

    1. If there are no dashes in the uid: return as-is.
    2. If every component is a hex number in range 1f1e6..1f1ff: return
       as-is (flag sequence).
    3. If the first component is 1f3f4 (compared case-insensitively):
       return as-is.
    4. Otherwise, insert '200d' between each component (not at start or
       end).

    NOTE(review): rule 3 matches every id that starts with 1f3f4, not only
    tag-sequence flags -- confirm that is intended for ids such as
    1f3f4-2620.
    """
    parts = uid.split("-")
    if len(parts) == 1:
        return uid

    def in_flag_range(component: str) -> bool:
        # A component that is not valid hex (e.g. an empty string produced
        # by a doubled or trailing dash) simply does not qualify.
        try:
            return 0x1F1E6 <= int(component, 16) <= 0x1F1FF
        except ValueError:
            return False

    if all(in_flag_range(p) for p in parts):
        return uid

    if parts[0].lower() == "1f3f4":
        return uid

    # Insert 200d between components
    return "-200d-".join(parts)


def fetch_json(url, timeout=30.0):
    """Download *url* and return its body decoded as JSON.

    Args:
        url: Address to request; sent with a fixed User-Agent header.
        timeout: Seconds to wait on blocking network operations before
            giving up, so an unresponsive server cannot hang the script
            forever. Pass None to wait indefinitely.

    Returns:
        The decoded JSON document.

    Raises:
        HTTPError: The server answered with an error status.
        URLError: The request could not be completed.
        Both are reported on stderr and then re-raised.
    """
    req = Request(url, headers={"User-Agent": "strip_emoji-python/1"})
    try:
        with urlopen(req, timeout=timeout) as resp:
            return json.load(resp)
    # HTTPError is a subclass of URLError, so it has to be handled first.
    except HTTPError as e:
        print(f"error: HTTP {e.code} fetching {url}", file=sys.stderr)
        raise
    except URLError as e:
        print(f"error: URL error fetching {url}: {e}", file=sys.stderr)
        raise


def build_mapping(data):
    """Collapse the API payload into a dict of processed uid -> name.

    *data* is expected to be an object mapping name -> url; anything that is
    not a dict produces an empty result. Entries whose value is not a string
    or does not match RE are skipped. When several names share one processed
    uid, the strictly shortest one wins and the earliest wins a tie.
    """
    shortest = {}
    if not isinstance(data, dict):
        return shortest

    for shortcode, url in data.items():
        matched = RE.match(url) if isinstance(url, str) else None
        if matched is None:
            continue
        key = process_uid(matched.group(1))
        # Only a strictly shorter name displaces one that is already stored.
        if key not in shortest or len(shortcode) < len(shortest[key]):
            shortest[key] = shortcode
    return shortest


def _lua_quote(text: str) -> str:
    """Return *text* as a double-quoted Lua string literal."""
    # Backslashes must be escaped first so that the escapes added afterwards
    # are not themselves doubled.
    escaped = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f'"{escaped}"'


def generate_lua_output(mapping):
    """Yield the generated Lua source for *mapping*, chunk by chunk.

    Args:
        mapping: Dict of dash-separated hex codepoints -> name.

    Yields:
        The file header, then for each entry one `codepoints[...] = {}` line
        per codepoint prefix not emitted before, followed by the line that
        stores the name in the END field of the innermost table, and finally
        the `return codepoints` line.
    """
    yield '''-- strip_emoji/data/github_emoji_data.lua
-- Mapping of unicode points to GitHub Markdown syntaxes
-- Generated by generate_list.py; DO NOT EDIT MANUALLY!
-- Copyright (c) 2026  1F616EMO
-- SPDX-License-Identifier: LGPL-3.0-only

local codepoints = {}
'''

    tables_created = set()
    for codepoints_raw, name in mapping.items():
        # Lua index expression built up one codepoint at a time, e.g.
        # "[0x1f9df]" and then "[0x1f9df][0x2642]".
        path = ""
        for codepoint in codepoints_raw.split("-"):
            path += f"[0x{codepoint}]"
            # Each table is created exactly once: a parent has to exist
            # before it is indexed, and assigning {} again would discard
            # whatever an earlier entry already stored in it.
            if path not in tables_created:
                tables_created.add(path)
                yield f"codepoints{path} = {{}}\n"
        # The name comes from remote data, so it is quoted defensively
        # before being embedded in generated code.
        yield f"codepoints{path}.END = {_lua_quote(name)}\n"

    yield "return codepoints\n"


def main():
    """Fetch the emoji list and write the generated Lua module to disk.

    The output path 'data/github_emoji_data.lua' is relative, so it is
    resolved against the current working directory.
    """
    mapping = build_mapping(fetch_json(GITHUB_EMOJI_API))
    with open('data/github_emoji_data.lua', 'w', encoding='utf-8') as lua_file:
        lua_file.writelines(generate_lua_output(mapping))


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
