Files
thetool/scripts/ipnetwork/initial-data.php
2025-08-21 14:34:42 +02:00

276 lines
12 KiB
PHP

<?php
/**
* This script parses a wiki text file containing network information and generates an SQL script
* to populate the IpNetwork table with a complete, hierarchical representation of the network.
*
* It includes on-the-fly data cleaning, hierarchical network detection from headers, robust parent-child logic,
* duplicate prevention, IP address validation, and error logging to 'initial_data_errors.txt'.
*
* @version 7.0
* @author Gemini
*/
// --- Configuration ---
$inputFile = 'combined_wiki.txt';        // Wiki export that is parsed
$outputFile = 'initial_data.sql';        // Generated SQL script
$dbTableName = 'IpNetwork';              // Target table of the generated INSERTs
$errorFile = 'initial_data_errors.txt';  // Log of issues found while processing

// --- Main Execution ---
$parsing_errors = []; // Collected by reference in both the parse and generate steps

// Remove a stale error log so a clean run does not leave old findings behind.
if (file_exists($errorFile)) {
    unlink($errorFile);
}

// Read and parse the wiki file into a structured PHP array.
$networkData = parseWikiFile($inputFile, $parsing_errors);

// Generate the SQL script from the parsed data.
$sqlScript = generateSqlScript($networkData, $dbTableName, $parsing_errors);

// Save the generated SQL script; only report success if the write actually worked.
if (file_put_contents($outputFile, $sqlScript) === false) {
    echo "Error: Could not write SQL script to '$outputFile'.\n";
    exit(1);
}
echo "Final SQL script was successfully generated in '$outputFile'\n";

// Save errors if any were found.
if (!empty($parsing_errors)) {
    $error_log_content = "Parsing process found the following issues:\n\n";
    $error_log_content .= implode("\n", $parsing_errors);
    if (file_put_contents($errorFile, $error_log_content) === false) {
        // Do not point the user at a log file that was never written.
        echo "Found " . count($parsing_errors) . " issues during processing, but the error log '$errorFile' could not be written.\n";
    } else {
        echo "Found " . count($parsing_errors) . " issues during processing. See $errorFile for details.\n";
    }
}
/**
 * Parses the wiki text file into a flat list of unique, valid IPv4 network/host entries.
 *
 * Every line is first cleaned (HTML entities decoded, backslashes removed, non-breaking
 * spaces turned into plain spaces, markdown mailto links reduced to their label,
 * whitespace collapsed) and then classified as one of:
 *  - a "=====" section header. A "===== START <ip>[/<cidr>] <desc> =====" header also
 *    yields a parent network entry; without a prefix, an address ending in ".0" is
 *    assumed to be a /24 block.
 *  - an HTML table row: every <td> cell holding a valid IPv4 address becomes a /32 host.
 *  - an "<ip>/<cidr> <desc>" network or "<ip> <desc>" host line.
 *  - free text, which is buffered and prepended to the description of the next entry.
 *
 * Sections whose header contains "gelöschte IP Netze" are skipped entirely. /32 lines
 * whose description marks them as free/reserved are recognised but not returned.
 * IPv6 networks, invalid addresses, out-of-range prefixes and duplicates are logged
 * to $errors and skipped.
 *
 * Terminates the script via die() when the input file is missing or unreadable.
 *
 * @param string $filename The path to the wiki text file.
 * @param array &$errors Array to which human-readable issue messages are appended.
 * @return array List of ['ip' => string, 'cidr' => int, 'name' => string, 'description' => string].
 */
function parseWikiFile(string $filename, array &$errors): array
{
    if (!file_exists($filename)) {
        die("Error: Input file '$filename' not found.\n");
    }
    $lines = file($filename, FILE_IGNORE_NEW_LINES);
    if ($lines === false) {
        // file() returns false (not an array) when the file exists but cannot be read.
        die("Error: Input file '$filename' could not be read.\n");
    }

    $entries = [];
    $seen_networks = [];        // "ip/cidr" => true, tracker for duplicates
    $pending_description = '';  // Free text waiting for the next IP entry
    $ignore_section = false;

    foreach ($lines as $line_number => $line) {
        $display_line = $line_number + 1; // 1-based for log messages

        // 1. Clean up the line from multiple inconsistencies.
        $cleaned_line = html_entity_decode($line, ENT_QUOTES | ENT_HTML5);
        // html_entity_decode() turns "&nbsp;" into U+00A0, which the byte-mode \s below
        // does not match, so normalise it here together with double-encoded "&nbsp;".
        $cleaned_line = str_replace(['\\', '&nbsp;', "\u{00A0}"], ['', ' ', ' '], $cleaned_line);
        $cleaned_line = preg_replace('/\[([^\]]+)\]\(mailto:[^\)]+\)/', '$1', $cleaned_line);
        $cleaned_line = trim(preg_replace('/\s+/', ' ', $cleaned_line));

        // 2. Section headers (START, END, etc.).
        if (str_starts_with($cleaned_line, '=====')) {
            $pending_description = ''; // Reset for any new section
            $ignore_section = (stripos($line, 'gelöschte IP Netze') !== false);

            if (!$ignore_section && preg_match('/^===== START\s+([0-9\.]+)(?:[\/-](\d{1,2}))?\s*(.*?)(?:\.md)?\s*=====$/', $cleaned_line, $header_matches)) {
                $ip_from_header = $header_matches[1];
                // preg_match() reports an unmatched optional group that is followed by a
                // matched group as '' (the key exists), so "?? null" alone can never detect
                // a missing prefix. Test for the empty string explicitly.
                $raw_cidr = $header_matches[2] ?? '';
                $cidr_from_header = $raw_cidr !== '' ? (int)$raw_cidr : null;
                $desc_from_header = trim($header_matches[3] ?? '');

                // Heuristic: If CIDR is missing but IP ends in .0, assume it's a /24 network block.
                if ($cidr_from_header === null && preg_match('/\.0$/', $ip_from_header)) {
                    $cidr_from_header = 24;
                    if (empty($desc_from_header) || strtolower($desc_from_header) === 'linknetze') {
                        $desc_from_header = "Network Block {$ip_from_header}";
                    }
                }

                // If a full network/CIDR is defined or inferred from the header, add it as a distinct entry.
                if ($cidr_from_header !== null && filter_var($ip_from_header, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
                    if ($cidr_from_header > 32) {
                        $errors[] = "Line $display_line: Invalid CIDR prefix in header: '$ip_from_header/$cidr_from_header'. Skipping.";
                    } else {
                        $network_key = "$ip_from_header/$cidr_from_header";
                        if (!isset($seen_networks[$network_key])) {
                            $entries[] = [
                                'ip' => $ip_from_header,
                                'cidr' => $cidr_from_header,
                                'name' => $desc_from_header,
                                'description' => $desc_from_header,
                            ];
                            $seen_networks[$network_key] = true;
                        }
                    }
                }
            }
            continue; // Header processed, move to next line
        }

        if ($ignore_section || empty($cleaned_line)) {
            // A blank line ends any free-text block collected so far.
            if (empty($cleaned_line)) {
                $pending_description = '';
            }
            continue;
        }

        // 3. HTML table data: each <td> containing an IPv4 address is a /32 host.
        if (str_contains($line, '<td')) {
            preg_match_all('/<td[^>]*>([\d\.]+)<\/td>/', $line, $ip_matches);
            foreach ($ip_matches[1] ?? [] as $ip_from_table) {
                if (!filter_var($ip_from_table, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
                    continue;
                }
                // Table hosts take part in duplicate tracking like every other entry,
                // so the returned list really is unique.
                $network_key = "$ip_from_table/32";
                if (isset($seen_networks[$network_key])) {
                    $errors[] = "Line $display_line: Duplicate entry in wiki file: '$network_key'. Skipping.";
                    continue;
                }
                $seen_networks[$network_key] = true;
                $entries[] = ['ip' => $ip_from_table, 'cidr' => 32, 'name' => 'CGNAT Host', 'description' => 'CGNAT Host from table'];
            }
            $pending_description = '';
            continue;
        }

        // 4. Try to match IP patterns.
        $is_ip_entry = false; // true => the line was an IP line (valid or not) and consumes pending text
        $entry_data = null;
        if (preg_match('/^([0-9\.]+)\/(\d+)\s*(.*)$/', $cleaned_line, $matches)) {
            $ip = $matches[1];
            $cidr = (int)$matches[2];
            if (!filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
                $errors[] = "Line $display_line: Invalid IPv4 address in network definition: '$ip'. Skipping.";
                $is_ip_entry = true;
            } elseif ($cidr > 32) {
                // An IPv4 prefix length must be within 0..32; anything else would produce
                // a nonsensical network size in the generated SQL.
                $errors[] = "Line $display_line: Invalid CIDR prefix in network definition: '$ip/{$matches[2]}'. Skipping.";
                $is_ip_entry = true;
            } else {
                $entry_data = ['ip' => $ip, 'cidr' => $cidr, 'desc' => $matches[3]];
            }
        } elseif (preg_match('/^([0-9\.]+)\s*(.*)$/', $cleaned_line, $matches)) {
            // A line that merely starts with digits/dots but is not a valid address is
            // treated as free text (it stays a non-IP line).
            if (filter_var($matches[1], FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
                $entry_data = ['ip' => $matches[1], 'cidr' => 32, 'desc' => $matches[2]];
            }
        }

        // IPv6 networks are recognised only to log and drop them.
        if (!$entry_data && preg_match('/^([0-9a-fA-F:]+)\/\d+/', $cleaned_line, $matches)) {
            if (filter_var($matches[1], FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
                $errors[] = "Line $display_line: IPv6 address found and ignored: '$cleaned_line'.";
                $is_ip_entry = true;
            }
        }

        if ($entry_data) {
            $is_ip_entry = true;
            $network_key = "{$entry_data['ip']}/{$entry_data['cidr']}";
            if (isset($seen_networks[$network_key])) {
                $errors[] = "Line $display_line: Duplicate entry in wiki file: '$network_key'. Skipping.";
            } else {
                $seen_networks[$network_key] = true;
                $full_description = trim($pending_description . ' ' . $entry_data['desc']);
                $is_reserved_host = $entry_data['cidr'] === 32
                    && preg_match('/\b(frei|f r e i|reserve[d]?)\b/i', $full_description);
                // Free/reserved hosts are remembered as seen but not added to the list.
                if (!$is_reserved_host) {
                    $entries[] = [
                        'ip' => $entry_data['ip'],
                        'cidr' => $entry_data['cidr'],
                        'name' => $full_description,
                        'description' => $full_description,
                    ];
                }
            }
        }

        if ($is_ip_entry) {
            $pending_description = '';
        } else {
            // Not an IP line: keep it as description text for the next entry.
            $pending_description .= ($pending_description === '' ? '' : "\n") . $cleaned_line;
        }
    }

    return $entries;
}
/**
 * Builds the complete SQL script as one string.
 *
 * The script truncates the table, inserts a fixed set of top-level parent blocks
 * (parent id NULL) and then inserts every parsed entry, ordered by prefix length and
 * address so that wider networks are written before the networks they contain. Each
 * parsed entry resolves its parent at execution time through a subquery that picks the
 * most specific already-inserted network containing its address.
 *
 * @param array $data Parsed entries with the keys 'ip', 'cidr', 'name' and 'description'.
 * @param string $tableName Name of the target table.
 * @param array &$errors Receives a message for every entry skipped as a duplicate.
 * @return string The generated SQL script.
 */
function generateSqlScript(array $data, string $tableName, array &$errors): string
{
    // Fixed top-level blocks: [address, prefix length, name, description].
    $topLevelBlocks = [
        // Private RFC1918 Ranges
        ['10.0.0.0', 8, 'Private Network Class A', 'RFC 1918 private address range for large networks.'],
        ['172.16.0.0', 12, 'Private Network Class B', 'RFC 1918 private address range for medium networks.'],
        ['192.168.0.0', 16, 'Private Network Class C', 'RFC 1918 private address range for small networks.'],
        // Public & Special Ranges
        ['100.64.0.0', 10, 'Carrier-Grade NAT (CGNAT)', 'RFC 6598 address range for Carrier-Grade NAT.'],
        ['5.206.200.0', 21, 'Public Network Block 5.206.200.0/21', 'Public IP address range.'],
        ['45.82.168.0', 22, 'Public Network Block 45.82.168.0/22', 'Public IP address range.'],
        ['46.151.200.0', 21, 'Public Network Block 46.151.200.0/21', 'Public IP address range.'],
        ['91.227.230.0', 22, 'Public Network Block 91.227.230.0/22', 'Public IP address range.'],
        ['91.227.236.0', 22, 'Public Network Block 91.227.236.0/22', 'Public IP address range.'],
        ['185.29.88.0', 22, 'Public Network Block 185.29.88.0/22', 'Public IP address range.'],
        ['192.254.252.0', 22, 'Public Network Block 192.254.252.0/22', 'Public IP address range.'],
        ['193.105.204.0', 22, 'Public Network Block 193.105.204.0/22', 'Public IP address range.'],
        ['193.186.244.0', 22, 'Public Network Block 193.186.244.0/22', 'Public IP address range.'],
        ['195.69.183.0', 24, 'Public Subnet 195.69.183.0/24', 'Public IP Subnet for KOLMI.'],
        ['195.191.252.0', 24, 'Public Subnet 195.191.252.0/24', 'Public IP Subnet for std Konzentrator.'],
    ];

    // Script fragments are collected here and joined once at the end.
    $fragments = [
        "-- SQL Data for table '$tableName'\n",
        "-- Automatically generated from the Wiki documentation (Version 7.0)\n\n",
        "TRUNCATE TABLE `$tableName`;\n\n",
        "-- Parent Networks (Top-Level)\n",
    ];

    $emitted = []; // "ip/cidr" => true for every network already written
    foreach ($topLevelBlocks as [$address, $prefix, $label, $details]) {
        $fragments[] = generateInsertStatement($tableName, $address, $prefix, 'NULL', 'active', $label, $details);
        $emitted["$address/$prefix"] = true;
    }

    // Shorter prefixes first, then ascending address, so parents precede their children.
    usort($data, function ($left, $right) {
        return ($left['cidr'] <=> $right['cidr'])
            ?: (ip2long($left['ip']) <=> ip2long($right['ip']));
    });

    $fragments[] = "\n-- Networks and Hosts from Wiki Documentation (Hierarchical)\n";
    foreach ($data as $entry) {
        ['ip' => $address, 'cidr' => $prefix] = $entry;
        $key = "$address/$prefix";

        if (isset($emitted[$key])) {
            $errors[] = "Duplicate network detected (already exists as a top-level parent): '$key'. Skipping INSERT.";
            continue;
        }

        // Subquery selecting the narrowest already-present network that contains this address.
        $addressExpr = "INET_ATON('$address')";
        $parentLookup = "(SELECT id FROM `$tableName` p WHERE "
            . "$addressExpr >= p.network_address AND "
            . "$addressExpr < (p.network_address + POWER(2, 32 - p.cidr)) AND "
            . "p.cidr < $prefix "
            . "ORDER BY p.cidr DESC LIMIT 1)";

        $fragments[] = generateInsertStatement($tableName, $address, $prefix, $parentLookup, 'active', $entry['name'], $entry['description']);
        $emitted[$key] = true;
    }

    return implode('', $fragments);
}
/**
 * Generates a single SQL INSERT statement for one network or host row.
 *
 * The name is limited to 100 characters. Truncation happens BEFORE escaping: cutting
 * an already-escaped string can split an escape sequence and leave a trailing
 * backslash that swallows the closing quote of the SQL literal.
 *
 * NOTE(review): escaping uses addslashes(), i.e. backslash escapes. This presumably
 * targets a MySQL/MariaDB server without NO_BACKSLASH_ESCAPES - confirm.
 *
 * @param string $tableName Target table name (emitted inside backticks, not escaped).
 * @param string $ip IPv4 address in dotted notation; expected to be validated by the caller.
 * @param int $cidr Prefix length.
 * @param string $parentIdSql Raw SQL expression for parent_network_id ('NULL' or a subquery).
 * @param string $status Value for the status column (emitted as-is inside quotes).
 * @param string $name Display name; truncated to 100 characters and escaped.
 * @param string $description Free-text description; escaped.
 * @return string The INSERT statement terminated by ";\n".
 */
function generateInsertStatement(string $tableName, string $ip, int $cidr, string $parentIdSql, string $status, string $name, string $description): string
{
    // Character-aware truncation avoids cutting a multibyte UTF-8 character in half;
    // fall back to byte-wise truncation when the mbstring extension is unavailable.
    $name = function_exists('mb_substr')
        ? mb_substr($name, 0, 100, 'UTF-8')
        : substr($name, 0, 100);
    $name = addslashes($name);
    $description = addslashes($description);

    return "INSERT INTO `$tableName` (`network_address`, `cidr`, `parent_network_id`, `status`, `name`, `description`, `location`, `create`, `edit`) VALUES " .
        "(INET_ATON('$ip'), $cidr, $parentIdSql, '$status', '$name', '$description', NULL, UNIX_TIMESTAMP(), UNIX_TIMESTAMP());\n";
}
?>