# nex_basse/backend/scripts/convert_sql.py

import re
import sys

# Prefix of a single-line data row, e.g. "INSERT INTO `tbl` (`a`, `b`) VALUES".
# Only this prefix holds identifiers; everything after it is row data.
_INSERT_HEAD_RE = re.compile(
    r'\s*INSERT INTO\s+(?:`[^`]+`\.)?`[^`]+`\s*(?:\([^)]*\)\s*)?VALUES\b'
)

# Leading quoted column name of a column definition line.
_COLUMN_HEAD_RE = re.compile(r'\s*"[^"]*"')


def _requote_identifiers(line):
    """Turn MySQL backtick quoting into double quotes without touching row data."""
    head = _INSERT_HEAD_RE.match(line)
    if head is None:
        # Schema line (or an INSERT shape we do not recognise): replace everywhere.
        return line.replace('`', '"')
    # Backticks inside the VALUES payload are data (e.g. markdown) and must survive.
    return head.group(0).replace('`', '"') + line[head.end():]


def _fix_insert_escapes(line):
    """Rewrite MySQL backslash-escaped quotes into their PostgreSQL spelling."""
    return line.replace(r'\"', '"').replace(r"\'", "''")


def _convert_column_definition(line):
    """Rewrite one MySQL column definition line into PostgreSQL syntax."""
    # Split off the quoted column name so type keywords are only rewritten in
    # the type/attribute part; a column called "json" or "int" keeps its name.
    head = _COLUMN_HEAD_RE.match(line)
    split_at = head.end() if head else 0
    name, spec = line[:split_at], line[split_at:]

    # tinyint (any display width) -> SMALLINT
    spec = re.sub(r'\btinyint(\(\d+\))?', 'SMALLINT', spec, flags=re.IGNORECASE)
    # bigint(20) / smallint(6): PostgreSQL has no display width, keep the type name.
    spec = re.sub(r'\b(bigint|smallint)\(\d+\)', r'\1', spec, flags=re.IGNORECASE)
    # mediumint has no PostgreSQL counterpart; INTEGER covers its range.
    spec = re.sub(r'\bmediumint\b(\(\d+\))?', 'INTEGER', spec, flags=re.IGNORECASE)
    # int(11) / int -> INTEGER. The leading \b keeps this from matching the
    # "int(...)" tail of bigint/smallint.
    spec = re.sub(r'\bint\(\d+\)', 'INTEGER', spec, flags=re.IGNORECASE)
    spec = re.sub(r'\bint\b', 'INTEGER', spec, flags=re.IGNORECASE)
    # datetime -> TIMESTAMP
    spec = re.sub(r'\bdatetime\b', 'TIMESTAMP', spec, flags=re.IGNORECASE)
    # Normalise the case of varchar(n) and json
    spec = re.sub(r'varchar\(\d+\)', lambda m: m.group(0).upper(), spec, flags=re.IGNORECASE)
    spec = re.sub(r'\bjson\b', 'JSON', spec, flags=re.IGNORECASE)

    # Remove MySQL specific column attributes
    spec = re.sub(r'\s+CHARACTER\s+SET\s+[\w]+', '', spec, flags=re.IGNORECASE)
    spec = re.sub(r'\s+COLLATE\s+[\w]+', '', spec, flags=re.IGNORECASE)

    line = name + spec

    # "id" INTEGER NOT NULL AUTO_INCREMENT -> "id" SERIAL (BIGSERIAL for bigint)
    if 'AUTO_INCREMENT' in line:
        line = re.sub(r'("[\w]+")\s+INTEGER\s+NOT\s+NULL\s+AUTO_INCREMENT', r'\1 SERIAL', line, flags=re.IGNORECASE)
        line = re.sub(r'("[\w]+")\s+bigint\s+NOT\s+NULL\s+AUTO_INCREMENT', r'\1 BIGSERIAL', line, flags=re.IGNORECASE)
        # Any AUTO_INCREMENT the two patterns above did not consume is dropped.
        line = re.sub(r'\s+AUTO_INCREMENT', '', line, flags=re.IGNORECASE)

    # Remove COMMENT '...' and ON UPDATE CURRENT_TIMESTAMP
    line = re.sub(r"\s+COMMENT\s+'[^']*'", "", line, flags=re.IGNORECASE)
    line = re.sub(r'\s+ON\s+UPDATE\s+CURRENT_TIMESTAMP', '', line, flags=re.IGNORECASE)
    return line


def convert_mysql_to_pg(input_file, output_file):
    """Convert a MySQL SQL dump into PostgreSQL-flavoured SQL.

    The conversion is line-oriented and regex based: it reads ``input_file``
    (UTF-8), rewrites it and writes the result to ``output_file`` (UTF-8).

    Steps performed:
      * drop blank lines, lines starting with ``/*`` or ``--``, and
        ``SET`` / ``LOCK TABLES`` / ``UNLOCK TABLES`` statements;
      * replace backtick identifier quoting with double quotes (the VALUES
        payload of a recognised single-line INSERT is left untouched);
      * rewrite column types and strip MySQL-only column attributes;
      * turn ``UNIQUE KEY`` into a ``CONSTRAINT ... UNIQUE``;
      * move plain ``KEY`` definitions to ``CREATE INDEX`` statements and
        foreign keys to ``ALTER TABLE ... ADD CONSTRAINT`` statements, both
        appended at the end of the output;
      * in foreign keys, strip database qualifiers and remap ``"users"`` to
        ``"sys_user"`` (referencing ``"user_id"`` instead of ``"id"``);
      * replace the ``) ENGINE=...`` table trailer with ``);`` and remove a
        trailing comma left before it;
      * rewrite ``\\"`` and ``\\'`` escapes in INSERT rows.

    Args:
        input_file: Path of the MySQL dump to read.
        output_file: Path the converted SQL is written to.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # 1. Pre-process: remove comments and session statements, requote identifiers.
    # Split on '\n' only: str.splitlines() would also split on \f, \v, \u2028
    # etc., which can appear inside INSERT data and would break a row apart.
    lines = []
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped == '' or stripped.startswith(('/*', '--')):
            continue
        if stripped.startswith(('SET ', 'LOCK TABLES', 'UNLOCK TABLES')):
            continue
        lines.append(_requote_identifiers(line))

    # 2. Line-by-line processing
    final_lines = []
    current_table = None
    deferred_indexes = []
    deferred_fks = []

    for line in lines:
        stripped = line.strip()

        # Data rows only need their string escapes fixed. Keeping them away from
        # the schema rules below prevents row text that merely mentions e.g.
        # "FOREIGN KEY" or "USING BTREE" from being dropped or rewritten.
        if stripped.startswith('INSERT INTO'):
            final_lines.append(_fix_insert_escapes(line))
            continue

        # Track current table (with or without IF NOT EXISTS)
        table_match = re.match(r'CREATE TABLE (?:IF NOT EXISTS )?"(\w+)"', stripped)
        if table_match:
            current_table = table_match.group(1)

        # A column definition starts with a quoted identifier.
        is_column_def = stripped.startswith('"') and 'INSERT INTO' not in line
        if is_column_def:
            line = _convert_column_definition(line)

        # UNIQUE KEY "name" (...) -> CONSTRAINT "name" UNIQUE (...)
        if 'UNIQUE KEY' in line:
            line = re.sub(r'UNIQUE KEY\s+"(\w+)"\s+(\(.*\))', r'CONSTRAINT "\1" UNIQUE \2', line, flags=re.IGNORECASE)

        # KEY "name" (...) -> deferred CREATE INDEX (skip PRIMARY, UNIQUE, FOREIGN)
        # MySQL: KEY "idx_name" ("col1", "col2")
        # Postgres: CREATE INDEX "idx_name" ON "table_name" ("col1", "col2");
        if re.search(r'^\s*KEY\s+"', line) and 'PRIMARY' not in line and 'UNIQUE' not in line and 'FOREIGN' not in line:
            key_match = re.search(r'^\s*KEY\s+"(\w+)"\s+(\(.*\))', line)
            if key_match and current_table:
                idx_name = key_match.group(1)
                idx_cols = key_match.group(2)
                deferred_indexes.append(f'CREATE INDEX "{idx_name}" ON "{current_table}" {idx_cols};')
                continue  # Skip this line in CREATE TABLE
            # Fallback if the pattern does not match or no table is known:
            # comment the line out to avoid a syntax error.
            line = "-- " + line

        # Foreign keys are pulled out of CREATE TABLE and added at the end.
        if 'FOREIGN KEY' in line:
            # Remove db.table references like "nex_docus"."users" -> "users"
            line = re.sub(r'"[\w]+"\."([\w]+)"', r'"\1"', line)
            # Fix "users" -> "sys_user" if applicable
            line = line.replace('"users"', '"sys_user"')

            # Fix sys_user PK reference (id -> user_id)
            if 'REFERENCES "sys_user"' in line:
                line = line.replace('("id")', '("user_id")')

            # Remove the trailing comma before wrapping in ALTER TABLE
            constraint_def = line.strip().rstrip(',')
            if current_table:
                deferred_fks.append(f'ALTER TABLE "{current_table}" ADD {constraint_def};')
            # The line is removed from the output even when no table is known.
            continue

        # Remove USING BTREE
        line = re.sub(r'\s+USING\s+BTREE', '', line, flags=re.IGNORECASE)

        # End of table definition cleanup
        if stripped.startswith((') ENGINE=', ') DEFAULT CHARSET=')):
            line = ');'
        elif ') ENGINE=' in line:
            line = re.sub(r'\)\s*ENGINE=[^;]+;', ');', line, flags=re.IGNORECASE)

        # Lines that contain an INSERT but do not start with it still get
        # their escapes rewritten.
        if 'INSERT INTO' in line:
            line = _fix_insert_escapes(line)

        final_lines.append(line)

    # Append deferred indexes
    if deferred_indexes:
        final_lines.append("\n-- Deferred Indexes")
        final_lines.extend(deferred_indexes)

    # Append deferred FKs
    if deferred_fks:
        final_lines.append("\n-- Deferred Foreign Keys")
        final_lines.extend(deferred_fks)

    content = '\n'.join(final_lines)

    # Extracting KEY / FOREIGN KEY lines can leave a dangling comma before ");".
    content = re.sub(r',\s*\);', ');', content)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)

if __name__ == '__main__':
    # Command-line entry point: exactly two positional arguments are required,
    # the input dump path followed by the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python convert_sql.py <input_file> <output_file>")
        sys.exit(1)

    source_path, target_path = cli_args
    convert_mysql_to_pg(source_path, target_path)