• Edit
  • Delete

PHP CLI script: fix charset encodings of directory and file names containing German umlauts

#!/usr/bin/php
<?php

/**
  * Fix charset encoding of directories and files
  * - converts to utf8
  * - fixes some garbled german umlauts
  *    
  * Current status: needs more testing with different charsets
  *
  * 2018-10-14 by klemens.ullmann-marx@ull.at
  */

// Throw exceptions for notices and warnings, so a failed conversion aborts
// the run instead of silently producing a broken name.
set_error_handler('errorHandler');

// ARGUMENTS
if (!isset($argv[1])) {
    $script = basename($argv[0]);
    echo "Usage:\n    $script '/your/path' [--confirmation]\n";
    exit(1);
}
$path = $argv[1];

// Renaming only happens when explicitly confirmed; anything else is a dry run.
$dryRun = !(isset($argv[2]) && $argv[2] === '--confirmation');

// Debugging hint: to try an ISO-8859 "ü", call
//   fix('immer brav ' . hex2bin('fc') . 'ben!', true);

// FIX DIRS first, then FIX FILES.
// The file listing is only collected after the directory pass has finished,
// so it already reflects any directory renames.
foreach (array('d', 'f') as $type) {
    // The path is quoted so spaces and shell metacharacters cannot break
    // (or inject into) the command. -print0 separates entries with NUL bytes,
    // which keeps names containing newlines or trailing blanks intact.
    $cmd = 'find ' . escapeshellargMB($path) . ' -type ' . $type . ' -print0';

    // shell_exec() yields null when the command fails or prints nothing
    $listing = rtrim((string) shell_exec($cmd), "\0");
    if ($listing === '') {
        // Nothing found: do not hand an empty name to fix()
        continue;
    }

    foreach (explode("\0", $listing) as $file) {
        fix($file, $dryRun);
    }
}

echo "\n\n\n\n";



/**
 * Inspect one path and, if its name looks wrongly encoded, rename it.
 *
 * Steps:
 *  - replace known garbled byte sequences that would confuse charset detection
 *  - detect the charset with the "file" command and convert to UTF-8
 *  - replace decomposed umlauts (letter + combining trema), further garbled
 *    characters and the " (ungültige Kodierung)" marker
 *  - print a report and rename via "mv" (unless $dryRun is set)
 *
 * For directories the complete path is examined and rewritten; for everything
 * else only the basename is.
 *
 * @param string $file   Path as listed by "find"
 * @param bool   $dryRun When true, only report what would be renamed
 *
 * @throws Exception Rethrown when iconv() rejects the detected charset
 */
function fix($file, $dryRun) {

    // An empty name has nothing to fix and would make "file" report a bogus charset
    if ($file === '') {
        return;
    }

    // Special handling for directories: the whole path is treated as the name
    $isDir = is_dir($file);
    $filename = $isDir ? $file : basename($file);

    $fixed = $filename;
    $tests = array();


    // GARBLED CHAR BEFORE CHARSET DETECTION
    // charset detection does not work for garbled chars.

    $garbledBefore = array(
        hex2bin('c281') => 'ü',
        hex2bin('c284') => 'ä',
        hex2bin('c294') => 'ö',
        hex2bin('c3a1') => 'ß',
    );
    foreach ($garbledBefore as $garble => $replacement) {
        if (strpos($fixed, $garble) !== false) {
            $fixed = str_replace($garble, $replacement, $fixed);
            $tests['garbled_before_charset_conversion'] = "Replaced garble char '$garble' (" . bin2hex($garble) . ") into '$replacement' (". bin2hex($replacement) . ")\n";
        }
    }


    // CHARSET CONVERSION

    // uchardet was tried before but proved very unreliable
    // (e.g. it reported utf8 "ü" as WINDOWS-1258), so "file" is used instead.

    // The name is passed as a quoted argument to printf, so quotes, "$" or
    // backticks inside a file name are neither interpreted by the shell nor
    // able to alter the command.
    // "-e soft" excludes "magic" file types like "core file (Xenix)" for "core_network.gif"
    $detectCmd = "printf '%s\\n' " . escapeshellargMB($fixed) . ' | file -b -e soft -';

    // Only the first word of the "file" output names the charset (cut)
    $charset = trim((string) shell_exec($detectCmd . ' | cut -f 1 -d " "'));
    $okCharsets = array(
        'ASCII',
        'UTF-8',
    );
    if (!in_array($charset, $okCharsets, true)) {
        // fix "file" command charset names
        $charset = str_replace('ISO-8859', 'ISO-8859-15', $charset);

        // catch illegal "file" charsets and allow debugging
        try {
            $fixed = iconv($charset, 'UTF-8', $fixed);
        } catch (Exception $e) {
            var_dump($file);
            var_dump($fixed);
            var_dump('Wrong charset found. "file" output: ' . shell_exec($detectCmd));
            throw $e;
        }

        $tests['charset'] = 'Detected charset by "file" command: ' . $charset . " ";
        $tests['charset'] .= 'Converted to UTF-8: ' . $fixed . "\n";
    }


    // TEST FOR TREMAS
    // base letter followed by the combining trema (bytes cc88)
    $tremas = array(
        hex2bin('61cc88') => 'ä',
        hex2bin('6Fcc88') => 'ö',
        hex2bin('75cc88') => 'ü',
        hex2bin('41cc88') => 'Ä',
        hex2bin('4Fcc88') => 'Ö',
        hex2bin('55cc88') => 'Ü',
    );
    foreach ($tremas as $trema => $umlaut) {
        if (strpos($filename, $trema) !== false) {
            $tests['trema'] = 'Trema detected: ' . bin2hex($trema) . '(' . $umlaut . ")\n";
        }
    }


    // 'INVALID ENCODING' STRING IN FILENAME
    if (strpos($filename, ' (ungültige Kodierung)') !== false) {
        $tests['invalid_encoding_string'] = 'Invalid string detected: "(ungültige Kodierung)"'. "\n";
    }


    // GARBLED CHARS
    // NOTE(review): the '?' and '' keys below look like characters that were
    // lost when this file was copied - confirm against the original bytes.
    // As written, duplicate '?' keys collapse into one entry (last value wins).
    $garbled = array(
        '”' => 'ö',
        '„' => 'ä',
        '™' => 'Ö',
        '´' => '',
        '‚' => '', // not a normal comma!
        '?' => 'ö',
        '' => 'ü',
        'á' => 'ß',
        '?' => 'ä',
        '?' => 'ö',
        
        ' (ungültige Kodierung)' => '',
    ) + $tremas;

    foreach ($garbled as $garble => $replacement) {
        $garble = (string) $garble;

        // An empty needle matches every name (or raises a warning on older
        // PHP versions) and replaces nothing, so it must be skipped.
        if ($garble === '') {
            continue;
        }

        if (strpos($fixed, $garble) !== false) {
            $fixed = str_replace($garble, $replacement, $fixed);
            $tests['garbled'] = "Replaced garble char '$garble' (" . bin2hex($garble) . ") into '$replacement' (". bin2hex($replacement) . ")\n";
        }
    }

    // Skip ok files
    if (!$tests) {
        // render dots for progress;
        echo ".";
        return;
    }

    if ($isDir) {
        echo "\n############## DIR ##################";
    }

    echo "\n>>>>> " . $file . "\n";

    // HEX
    echo "[hex_filename] \n" . hexdump($filename, false, false, true);

    foreach ($tests as $type => $text) {
        echo "[$type] \t" . $text;
    }

    // Build the rename target.
    if ($isDir) {
        // For directories the whole path was examined, so $fixed is the full target
        $target = $fixed;
    } else {
        // Replace only the trailing basename. A plain str_replace() over the
        // path would also rewrite parent directories containing the same text.
        $slash = strrpos(rtrim($file, '/'), '/');
        $target = ($slash === false ? '' : substr($file, 0, $slash + 1)) . $fixed;
    }

    // NOW RENAME THE FILES
    $cmd = 'mv ' . escapeshellargMB($file) . ' ' . escapeshellargMB($target);
    echo $cmd . "\n";
    if ($target === $file) {
        echo "Name unchanged. Nothing to rename.\n";
    } elseif (file_exists($target)) {
        // mv would overwrite a file, or move a directory into the existing one
        echo "Target already exists! Skipped.\n";
    } elseif ($dryRun) {
        echo "Dry run! No Change made.\n";
    } else {
        echo shell_exec($cmd) . "\n";
    }

    echo "\n";
}





// Quote a value for use as a single shell argument.
// Done by hand because escapeshellarg() may remove multibyte chars:
// every single quote is closed, escaped and reopened, then the whole
// value is wrapped in single quotes.
function escapeshellargMB($string) {
    return sprintf("'%s'", strtr($string, array("'" => "'\\''")));
}


/**
 * Render a string as a classic hexdump: offset, hex bytes, printable text.
 *
 * Each row covers up to 16 bytes, with an extra gap after the eighth byte.
 * Bytes below 32 are shown as '.' in the text column. Handy for inspecting
 * binary data or strings containing non-viewable characters.
 *
 * @version     1.3.2
 * @author      Aidan Lister <aidan@php.net>
 * @author      Peter Waller <iridum@php.net>
 * @link        http://aidanlister.com/2004/04/viewing-binary-data-as-a-hexdump-in-php/
 * @param       string  $data        The string to be dumped
 * @param       bool    $htmloutput  Set to false for non-HTML output
 * @param       bool    $uppercase   Set to true for uppercase hex
 * @param       bool    $return      Set to true to return the dump
 * @return      string|null          The dump when $return is not false, otherwise nothing (the dump is echoed)
 */
function hexdump ($data, $htmloutput = true, $uppercase = false, $return = false)
{
    $asHtml    = ($htmloutput === true);
    $hexFormat = ($uppercase === false) ? 'x' : 'X';
    $total     = strlen($data);
    $rows      = array();

    // Walk the data in rows of 16 bytes
    for ($start = 0; $start < $total; $start += 16) {
        $hexColumn  = '';
        $textColumn = '';
        $stop       = min($start + 16, $total);

        for ($pos = $start; $pos < $stop; $pos++) {
            $byte = $data[$pos];
            $code = ord($byte);

            $hexColumn .= sprintf('%02' . $hexFormat . ' ', $code);

            // Non-viewable bytes become '.', everything else is shown
            // (entity-encoded in HTML mode)
            if ($code < 32) {
                $textColumn .= '.';
            } elseif ($asHtml) {
                $textColumn .= htmlentities($byte);
            } else {
                $textColumn .= $byte;
            }

            // Visual gap between the two halves of a row
            if ($pos - $start === 7) {
                $hexColumn  .= ' ';
                $textColumn .= ' ';
            }
        }

        $rows[] = sprintf('%04' . $hexFormat . '  %-49s  %s', $start, $hexColumn, $textColumn);
    }

    // Rows are newline-separated; the dump always ends with one newline
    $dump = ($asHtml ? '<pre>' : '')
          . implode("\n", $rows)
          . ($asHtml ? '</pre>' : '')
          . "\n";

    if ($return === false) {
        echo $dump;
        return;
    }

    return $dump;
}

/**
 * Error handler: turns notices and warnings into exceptions.
 *
 * Notices and warnings are thrown as ErrorException carrying the original
 * severity, file and line, so callers can catch them and inspect where they
 * came from. Every other error level is only printed.
 *
 * @param int    $errNo   Error level (E_* constant)
 * @param string $errStr  Error message
 * @param string $errFile File the error was raised in
 * @param int    $errLine Line the error was raised on
 *
 * @throws ErrorException For E_NOTICE and E_WARNING
 */
function errorHandler($errNo, $errStr, $errFile, $errLine) {
    $msg = "$errStr in $errFile on line $errLine";

    if ($errNo === E_NOTICE || $errNo === E_WARNING) {
        // The code stays $errNo as before; severity, file and line are now
        // filled in as well, instead of defaulting to E_ERROR and this handler.
        throw new ErrorException($msg, $errNo, $errNo, $errFile, $errLine);
    }

    // Terminate the line so the message does not run into the progress dots
    echo $msg . "\n";
}