"""Convert a MySQL SQL dump into PostgreSQL-compatible SQL."""

import re
import sys

# Dump lines that are dropped outright: comments, /*! ... */ directives,
# session SET statements and table-lock statements.
_SKIPPED_PREFIXES = ('/*', '--', 'SET ', 'LOCK TABLES', 'UNLOCK TABLES')

# Matches either a complete single-quoted string literal (honouring \x and ''
# escapes) or a bare backtick.  Consuming literals whole is what lets backticks
# inside data survive untouched.
_STRING_OR_BACKTICK = re.compile(r"'(?:[^'\\]|\\.|'')*'|`")

_CREATE_TABLE = re.compile(r'CREATE TABLE (?:IF NOT EXISTS )?"(\w+)"')

# Splits a column line into (indent + quoted column name, rest of definition).
_COLUMN_SPLIT = re.compile(r'^(\s*"[^"]*")(.*)$')

# COMMENT '...' where the text may contain \' or '' escaped quotes.
_COLUMN_COMMENT = re.compile(r"\s+COMMENT\s+'(?:[^'\\]|\\.|'')*'", re.IGNORECASE)

# Ordered MySQL -> PostgreSQL type rewrites.  Every pattern is anchored with
# \b so that e.g. "bigint(20)" is never half-rewritten by the plain int rule.
_TYPE_REWRITES = (
    (re.compile(r'\btinyint(\(\d+\))?', re.IGNORECASE), 'SMALLINT'),
    (re.compile(r'\bsmallint\(\d+\)', re.IGNORECASE), 'SMALLINT'),
    (re.compile(r'\bmediumint(\(\d+\))?', re.IGNORECASE), 'INTEGER'),
    (re.compile(r'\bbigint\(\d+\)', re.IGNORECASE), 'BIGINT'),
    (re.compile(r'\bint\(\d+\)', re.IGNORECASE), 'INTEGER'),
    (re.compile(r'\bint\b', re.IGNORECASE), 'INTEGER'),
    (re.compile(r'\bdatetime\b', re.IGNORECASE), 'TIMESTAMP'),
    (re.compile(r'\bjson\b', re.IGNORECASE), 'JSON'),
)
_VARCHAR = re.compile(r'varchar\(\d+\)', re.IGNORECASE)

# MySQL-only column attributes that are simply removed.
_DROPPED_COLUMN_ATTRS = (
    re.compile(r'\s+CHARACTER\s+SET\s+[\w]+', re.IGNORECASE),
    re.compile(r'\s+COLLATE\s+[\w]+', re.IGNORECASE),
    re.compile(r'\s+(?:unsigned|zerofill)\b', re.IGNORECASE),
    re.compile(r'\s+ON\s+UPDATE\s+CURRENT_TIMESTAMP(?:\(\d*\))?', re.IGNORECASE),
)

_SERIAL = re.compile(
    r'("[\w]+")\s+INTEGER\s+NOT\s+NULL\s+AUTO_INCREMENT', re.IGNORECASE)
_BIGSERIAL = re.compile(
    r'("[\w]+")\s+bigint\s+NOT\s+NULL\s+AUTO_INCREMENT', re.IGNORECASE)
_AUTO_INCREMENT = re.compile(r'\s+AUTO_INCREMENT', re.IGNORECASE)

_UNIQUE_KEY = re.compile(r'UNIQUE KEY\s+"(\w+)"\s+(\(.*\))', re.IGNORECASE)
_PLAIN_KEY_PREFIX = re.compile(r'^\s*KEY\s+"')
_PLAIN_KEY = re.compile(r'^\s*KEY\s+"(\w+)"\s+(\(.*\))')
_QUALIFIED_NAME = re.compile(r'"[\w]+"\."([\w]+)"')
_USING_BTREE = re.compile(r'\s+USING\s+BTREE', re.IGNORECASE)
_INLINE_ENGINE = re.compile(r'\)\s*ENGINE=[^;]+;', re.IGNORECASE)
_TRAILING_COMMA = re.compile(r',\s*\);')


def _requote_identifiers(line):
    """Turn MySQL backtick quoting into double quotes, outside string literals."""
    if '`' not in line:
        return line
    return _STRING_OR_BACKTICK.sub(
        lambda m: '"' if m.group(0) == '`' else m.group(0), line)


def _convert_column_definition(line):
    """Rewrite one column-definition line from MySQL to PostgreSQL syntax."""
    split = _COLUMN_SPLIT.match(line)
    # Only the part after the quoted column name is rewritten, so a column
    # that happens to be called "datetime" or "int" keeps its name.
    name, spec = split.groups() if split else ('', line)

    # Drop the comment first so its free text cannot trip the rules below.
    spec = _COLUMN_COMMENT.sub('', spec)
    for pattern, replacement in _TYPE_REWRITES:
        spec = pattern.sub(replacement, spec)
    spec = _VARCHAR.sub(lambda m: m.group(0).upper(), spec)
    for pattern in _DROPPED_COLUMN_ATTRS:
        spec = pattern.sub('', spec)

    line = name + spec
    if 'AUTO_INCREMENT' in line:
        line = _SERIAL.sub(r'\1 SERIAL', line)
        line = _BIGSERIAL.sub(r'\1 BIGSERIAL', line)
        # Anything not matched above just loses the MySQL-only keyword.
        line = _AUTO_INCREMENT.sub('', line)
    return line


def convert_mysql_to_pg(input_file, output_file):
    """Read a MySQL dump from *input_file* and write PostgreSQL SQL to *output_file*.

    Both files are handled as UTF-8 text.  The conversion is line based:

    * comment, ``SET`` and ``LOCK``/``UNLOCK TABLES`` lines and blank lines
      are dropped;
    * backtick-quoted identifiers become double-quoted (string literals are
      left untouched);
    * column types and attributes are translated, ``AUTO_INCREMENT`` integer
      keys become ``SERIAL``/``BIGSERIAL``;
    * ``UNIQUE KEY`` becomes a named ``UNIQUE`` constraint, plain ``KEY``
      lines become ``CREATE INDEX`` statements and ``FOREIGN KEY`` constraints
      become ``ALTER TABLE ... ADD`` statements, both appended after all other
      output (index and constraint names are carried over unchanged);
    * in foreign keys, schema qualifiers are stripped, ``"users"`` is renamed
      to ``"sys_user"`` and a reference to ``"sys_user"`` has ``("id")``
      replaced by ``("user_id")``;
    * ``INSERT INTO`` lines only get their ``\\"`` and ``\\'`` escapes
      rewritten; no schema rule is applied to row data.

    Args:
        input_file: Path of the MySQL dump to read.
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    final_lines = []
    current_table = None
    deferred_indexes = []
    deferred_fks = []

    for raw_line in content.splitlines():
        raw_stripped = raw_line.strip()
        if not raw_stripped or raw_stripped.startswith(_SKIPPED_PREFIXES):
            continue

        line = _requote_identifiers(raw_line)
        stripped = line.strip()

        if 'INSERT INTO' in line:
            # Row data: translate MySQL quote escapes and nothing else, so
            # words such as "FOREIGN KEY" inside values are not misread as DDL.
            line = line.replace(r'\"', '"')
            line = line.replace(r"\'", "''")
            final_lines.append(line)
            continue

        table_match = _CREATE_TABLE.match(stripped)
        if table_match:
            current_table = table_match.group(1)

        # Inside a dump, a line that opens with a quoted identifier is taken
        # to be a column definition.
        if stripped.startswith('"'):
            line = _convert_column_definition(line)

        if 'UNIQUE KEY' in line:
            line = _UNIQUE_KEY.sub(r'CONSTRAINT "\1" UNIQUE \2', line)

        # The pattern is anchored at the start of the line, so PRIMARY/UNIQUE/
        # FOREIGN KEY lines can never match it.
        if _PLAIN_KEY_PREFIX.search(line):
            key_match = _PLAIN_KEY.search(line)
            if key_match and current_table:
                idx_name, idx_cols = key_match.groups()
                deferred_indexes.append(
                    f'CREATE INDEX "{idx_name}" ON "{current_table}" {idx_cols};')
                continue
            # Unparseable index: comment it out rather than emit invalid SQL.
            line = "-- " + line

        if 'FOREIGN KEY' in line:
            # "schema"."table" -> "table"
            line = _QUALIFIED_NAME.sub(r'"\1"', line)
            # Project-specific renames of the referenced user table and its key.
            line = line.replace('"users"', '"sys_user"')
            if 'REFERENCES "sys_user"' in line:
                line = line.replace('("id")', '("user_id")')
            if current_table:
                constraint_def = line.strip().rstrip(',')
                deferred_fks.append(
                    f'ALTER TABLE "{current_table}" ADD {constraint_def};')
                continue

        line = _USING_BTREE.sub('', line)

        # Table options after the closing parenthesis are MySQL-only.
        if stripped.startswith((') ENGINE=', ') DEFAULT CHARSET=')):
            line = ');'
        elif ') ENGINE=' in line:
            line = _INLINE_ENGINE.sub(');', line)

        final_lines.append(line)

    if deferred_indexes:
        final_lines.append("\n-- Deferred Indexes")
        final_lines.extend(deferred_indexes)

    if deferred_fks:
        final_lines.append("\n-- Deferred Foreign Keys")
        final_lines.extend(deferred_fks)

    content = '\n'.join(final_lines)

    # Removing KEY / FOREIGN KEY lines can leave a dangling comma before ");".
    content = _TRAILING_COMMA.sub(');', content)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python convert_sql.py <input_file> <output_file>",
              file=sys.stderr)
        sys.exit(1)

    convert_mysql_to_pg(sys.argv[1], sys.argv[2])