#!/usr/bin/php
<?php
/**
* Fix charset encoding of directories and files
* - converts names to UTF-8
* - fixes some garbled German umlauts
*
* Current status: needs more testing with different charsets
*
* 2018-10-14 by klemens.ullmann-marx@ull.at
*/
// Throw exceptions for notices and warnings (see errorHandler()), so that a
// failing step such as iconv() rejecting a charset stops the run.
set_error_handler('errorHandler');

// ARGUMENTS
// $argv[1]: path to scan (required)
// $argv[2]: '--confirmation' to actually rename; anything else is a dry run
if (!isset($argv[1])) {
    $script = basename($argv[0]);
    echo "Usage:\n $script '/your/path' [--confirmation]\n";
    exit(1);
}
$path = $argv[1];
$dryRun = true;
if (isset($argv[2]) && $argv[2] === '--confirmation') {
    $dryRun = false;
}

// Debugging aids, kept for reference:
//var_dump($path, $dryRun);die;
//$path = '/srv/nextcloud/klemens/files/Mullarx-Musik/Björk';
//$path = '/srv/nextcloud/klemens/files/Mullarx-Musik';
//$path = '/srv/nextcloud/klemens/files';
/*
hexdump('u?', false);
hexdump(hex2bin('75cc88'), false);
die;
*/

// NOTE(review): each pass works on a single `find` snapshot that lists parents
// before their children. After a directory has really been renamed, the
// snapshot entries below it still carry the old path, so nested directories
// may need another run of the script - verify on real data.

// FIX DIRS
// The path is quoted with escapeshellargMB(): unquoted, a path containing
// spaces or shell metacharacters breaks the command or is run by the shell.
$cmd = 'find ' . escapeshellargMB($path) . ' -type d';
// Only strip the final newline(s); trim() would also eat spaces that belong
// to the first or last listed name. The cast covers a null from shell_exec().
$result = rtrim((string) shell_exec($cmd), "\n");
// explode() on an empty string yields array('') - skip fix() in that case.
$files = ($result === '') ? array() : explode("\n", $result);
foreach ($files as $file) {
    fix($file, $dryRun);
}

// FIX FILES
// Listed only after the directory pass, so the paths already contain the
// directory names that were renamed above.
$cmd = 'find ' . escapeshellargMB($path) . ' -type f';
$result = rtrim((string) shell_exec($cmd), "\n");
$files = ($result === '') ? array() : explode("\n", $result);
// ISO-8859 umlaut ü test
//array_unshift($files, 'immer brav ' . hex2bin('fc') . 'ben!');
foreach ($files as $file) {
    fix($file, $dryRun);
}

echo "\n\n\n\n";
// FUNCTION
/**
 * Check a single path and rename it if its name needs fixing.
 *
 * Steps, in order:
 *  1. replace known garbled byte sequences that would confuse charset detection
 *  2. ask the `file` command for the charset of the name and convert anything
 *     other than ASCII / UTF-8 to UTF-8 with iconv()
 *  3. report decomposed umlauts (letter + combining diaeresis) and the
 *     " (ungültige Kodierung)" marker
 *  4. replace the remaining known garbled characters
 *
 * When any step triggered, a report is printed together with the `mv` command,
 * which is executed unless $dryRun is set. Otherwise a single "." is printed
 * as a progress marker.
 *
 * For directories the whole path is examined and fixed, for everything else
 * only the last path component.
 *
 * @param string $file   Path to check
 * @param bool   $dryRun When true, print the `mv` command without running it
 *
 * @throws Exception Re-thrown after dumping debug output when iconv() fails
 *                   (the script's error handler turns its warning/notice into
 *                   an exception)
 */
function fix($file, $dryRun) {
    // Nothing to do for an empty path (e.g. an empty line in a file listing);
    // the `file` output for an empty name is not a charset.
    if ((string) $file === '') {
        return;
    }

    // Special handling for directories
    $isDir = is_dir($file);
    $filename = $isDir ? $file : basename($file);
    $fixed = $filename;
    $tests = array();

    // GARBLED CHAR BEFORE CHARSET DETECTION
    // charset detection does not work for garbled chars.
    $garbledBefore = array(
        hex2bin('c281') => 'ü',
        hex2bin('c284') => 'ä',
        hex2bin('c294') => 'ö',
        hex2bin('c3a1') => 'ß',
    );
    foreach ($garbledBefore as $garble => $replacement) {
        if (strpos($fixed, $garble) !== false) {
            $fixed = str_replace($garble, $replacement, $fixed);
            $tests['garbled_before_charset_conversion'] = "Replaced garble char '$garble' (" . bin2hex($garble) . ") into '$replacement' (". bin2hex($replacement) . ")\n";
        }
    }

    // CHARSET CONVERSION
    // uchardet is very unreliable, eg reports utf8 "ü" as WINDOWS-1258,
    // so the charset is taken from the "file" command instead.
    // "-e soft" excludes "magic" file types like "core file (Xenix)" for
    // "core_network.gif"; "cut" keeps only the first word of the answer.
    //
    // The name is passed as a quoted printf argument: inside echo "..." a name
    // containing ", $, ` or \ would break the command or be run by the shell.
    $detectCmd = "printf '%s\\n' " . escapeshellargMB($fixed) . ' | file -b -e soft -';
    $charset = trim((string) shell_exec($detectCmd . ' | cut -f 1 -d " "'));
    $okCharsets = array(
        'ASCII',
        'UTF-8',
    );
    if (!in_array($charset, $okCharsets, true)) {
        // fix "file" command charset names
        $charset = str_replace('ISO-8859', 'ISO-8859-15', $charset);
        // catch illegal "file" charsets and allow debugging
        try {
            $fixed = iconv($charset, 'UTF-8', $fixed);
        } catch (Exception $e) {
            var_dump($file);
            var_dump($fixed);
            var_dump('Wrong charset found. "file" output: ' . shell_exec($detectCmd));
            throw $e;
        }
        $tests['charset'] = 'Detected charset by "file" command: ' . $charset . " ";
        $tests['charset'] .= 'Converted to UTF-8: ' . $fixed . "\n";
    }

    // TEST FOR TREMAS
    // base letter followed by the bytes cc 88 => precomposed umlaut
    $tremas = array(
        hex2bin('61cc88') => 'ä',
        hex2bin('6Fcc88') => 'ö',
        hex2bin('75cc88') => 'ü',
        hex2bin('41cc88') => 'Ä',
        hex2bin('4Fcc88') => 'Ö',
        hex2bin('55cc88') => 'Ü',
    );
    foreach ($tremas as $trema => $umlaut) {
        // Checked against the original name; the replacement itself happens
        // below because $tremas is merged into $garbled.
        if (strpos($filename, $trema) !== false) {
            $tests['trema'] = 'Trema detected: ' . bin2hex($trema) . '(' . $umlaut . ")\n";
        }
    }

    // 'INVALID ENCODING' STRING IN FILENAME
    if (strpos($filename, ' (ungültige Kodierung)') !== false) {
        $tests['invalid_encoding_string'] = 'Invalid string detected: "(ungültige Kodierung)"'. "\n";
    }

    // GARBLED CHARS
    // NOTE(review): the keys for the second 'ö', for 'ü' and for the second 'ä'
    // were unprintable characters that had degraded to '?' / '' in the source.
    // A '' needle is invalid for strstr()/strpos() and '?' would rewrite
    // genuine question marks. They are written as explicit byte sequences
    // here, presumably the same ones as in $garbledBefore (they can reappear
    // after the charset conversion) - TODO confirm against real file names.
    $garbled = array(
        '”' => 'ö',
        '„' => 'ä',
        '™' => 'Ö',
        '´' => '',
        '‚' => '', // not a normal comma!
        hex2bin('c294') => 'ö',
        hex2bin('c281') => 'ü',
        'á' => 'ß',
        hex2bin('c284') => 'ä',
        ' (ungültige Kodierung)' => '',
    ) + $tremas;
    foreach ($garbled as $garble => $replacement) {
        if (strpos($fixed, $garble) !== false) {
            $fixed = str_replace($garble, $replacement, $fixed);
            $tests['garbled'] = "Replaced garble char '$garble' (" . bin2hex($garble) . ") into '$replacement' (". bin2hex($replacement) . ")\n";
        }
    }

    // Skip ok files
    if (!$tests) {
        // render dots for progress;
        echo ".";
        return;
    }

    if ($isDir) {
        echo "\n############## DIR ##################";
    }
    echo "\n>>>>> " . $file . "\n";

    // HEX
    echo "[hex_filename] \n" . hexdump($filename, false, false, true);
    foreach ($tests as $type => $text) {
        echo "[$type] \t" . $text;
    }

    // NOW RENAME THE FILES
    // For directories $fixed already is the complete path. For files only the
    // last component is swapped; a str_replace() over the whole path would
    // also rewrite parent directories that happen to contain the same text.
    $target = $isDir ? $fixed : dirname($file) . '/' . $fixed;
    $cmd = 'mv ' . escapeshellargMB($file) . ' ' . escapeshellargMB($target);
    echo $cmd . "\n";
    if ($dryRun) {
        echo "Dry run! No Change made.\n";
    } else {
        echo shell_exec($cmd) . "\n";
    }
    echo "\n";
}
/**
 * Quote a string as one single-quoted shell argument.
 *
 * Stand-in for escapeshellarg() that leaves multibyte characters untouched:
 * only embedded single quotes are rewritten (as '\''), everything else is
 * passed through byte for byte.
 *
 * @param string $string Raw argument
 * @return string The argument wrapped in single quotes
 */
function escapeshellargMB($string) {
    // close the quote, emit an escaped quote, reopen the quote
    $escaped = strtr($string, array("'" => "'\\''"));

    return "'" . $escaped . "'";
}
/**
 * Render a string as a hexdump.
 *
 * Each output row covers up to 16 bytes and consists of the row offset, the
 * bytes in hexadecimal and the bytes as text. Bytes with a value below 32 are
 * shown as '.' in the text column, and both columns get an extra space after
 * the eighth byte of a row. Rows are separated by newlines and the dump ends
 * with a newline.
 *
 * @version 1.3.2
 * @author Aidan Lister <aidan@php.net>
 * @author Peter Waller <iridum@php.net>
 * @link http://aidanlister.com/2004/04/viewing-binary-data-as-a-hexdump-in-php/
 * @param string $data The string to be dumped
 * @param bool $htmloutput Only an exact true wraps the dump in <pre> tags and
 *                         passes text bytes through htmlentities()
 * @param bool $uppercase Only an exact false gives lowercase hex
 * @param bool $return Only an exact false echoes the dump; otherwise it is returned
 * @return string|null The dump when $return is not false, otherwise nothing
 */
function hexdump ($data, $htmloutput = true, $uppercase = false, $return = false)
{
    $asHtml = ($htmloutput === true);

    // The same conversion letter is used for the offset and the byte values
    $conversion = ($uppercase === false) ? 'x' : 'X';
    $byteFormat = '%02' . $conversion . ' ';
    $rowFormat = '%04' . $conversion . ' %-49s %s';

    $length = strlen($data);
    $lastIndex = $length - 1;

    $rows = array();
    $hexColumn = '';
    $textColumn = '';

    for ($pos = 0; $pos < $length; $pos++) {
        $byte = $data[$pos];
        $code = ord($byte);
        // Position of this byte inside its 16 byte row
        $column = $pos % 16;

        $hexColumn .= sprintf($byteFormat, $code);

        // Non-viewable bytes become '.'
        if ($code < 32) {
            $textColumn .= '.';
        } elseif ($asHtml) {
            $textColumn .= htmlentities($byte);
        } else {
            $textColumn .= $byte;
        }

        // Extra spacing after the eighth byte of a row
        if ($column === 7) {
            $hexColumn .= ' ';
            $textColumn .= ' ';
        }

        // A row is complete after 16 bytes or at the end of the data
        if ($column === 15 || $pos === $lastIndex) {
            $rows[] = sprintf($rowFormat, $pos - $column, $hexColumn, $textColumn);
            $hexColumn = '';
            $textColumn = '';
        }
    }

    $dump = ($asHtml ? '<pre>' : '')
        . implode("\n", $rows)
        . ($asHtml ? '</pre>' : '')
        . "\n";

    if ($return !== false) {
        return $dump;
    }

    echo $dump;
}
/**
 * Error handler, meant to be registered with set_error_handler(): turns
 * notices and warnings into exceptions.
 *
 * E_NOTICE and E_WARNING are thrown as ErrorException. Message and code are
 * the same as before ("<text> in <file> on line <line>", error level as
 * code); in addition the exception now carries the real severity, file and
 * line of the error instead of the defaults (E_ERROR and the location of this
 * handler). Every other error level is only printed, one message per line.
 *
 * @param int    $errNo   Error level (E_* constant)
 * @param string $errStr  Error message
 * @param string $errFile File in which the error was raised
 * @param int    $errLine Line on which the error was raised
 *
 * @throws ErrorException For E_NOTICE and E_WARNING
 */
function errorHandler($errNo, $errStr, $errFile, $errLine) {
    $msg = "$errStr in $errFile on line $errLine";

    if ($errNo === E_NOTICE || $errNo === E_WARNING) {
        throw new ErrorException($msg, $errNo, $errNo, $errFile, $errLine);
    }

    // Terminate the line so the message does not run into the progress dots
    // or into the next message.
    echo $msg . "\n";
}