I've written a small console application (source below) to locate and optionally rename files containing international characters, as they are a source of constant pain with most source control systems (some background on this below). The code I'm using has a simple dictionary with characters to look for and replace (and nukes every other character that uses more than one byte of storage), but it feels very hackish. What's the right way to (a) find out whether a character is international, and (b) determine the best ASCII substitution for it?
Let me provide some background information on why this is needed. It so happens that the Danish Å character has two different representations in Unicode, and therefore two different UTF-8 byte sequences, both rendering as the same symbol. These are known as the NFC (precomposed, a single code point) and NFD (decomposed, the letter A followed by a combining ring) normalization forms. Windows and Linux will create NFC names by default but respect whatever form they are given. Mac will convert all names (when saving to an HFS+ partition) to NFD and therefore returns a different byte stream for the name of a file created on Windows. This effectively breaks Subversion, Git and lots of other utilities that don't care to properly handle this scenario.
I'm currently evaluating Mercurial, which turns out to be even worse at handling international characters. Being fairly tired of these problems, I decided that either source control or the international characters would have to go, and so here we are.
My current implementation:
/// <summary>
/// Detects non-ASCII ("international") characters in file names and builds an
/// ASCII-only replacement name from a substitution table.
/// </summary>
/// <remarks>
/// The instance learns as it goes: the first time an unmapped non-ASCII character
/// is encountered it is reported on the console and, unless it is listed in
/// <c>keep</c>, registered with an empty replacement so that it is stripped from
/// that point on.
/// </remarks>
public class Checker
{
    // Known characters and their replacement; an empty string means "drop the character".
    private readonly Dictionary<char, string> internationals = new Dictionary<char, string>
    {
        // Scandinavian letters are transliterated to their conventional two-letter form.
        { 'æ', "ae" },
        { 'ø', "oe" },
        { 'å', "aa" },
        { 'Æ', "Ae" },
        { 'Ø', "Oe" },
        { 'Å', "Aa" },

        // Accented letters are reduced to their base letter.
        { 'ö', "o" },
        { 'ü', "u" },
        { 'ä', "a" },
        { 'é', "e" },
        { 'è', "e" },
        { 'ê', "e" },

        // Characters that are removed outright. Presumably these are residue of UTF-8
        // text that was mis-decoded as Latin-1 (e.g. "Ã¦" for "æ").
        // NOTE(review): the no-break space and soft hyphen entries were unreadable in
        // the original listing and are reconstructed from that pattern - confirm.
        { '¦', "" },
        { 'Ã', "" },
        { '©', "" },
        { '\u00A0', "" }, // no-break space (a plain ' ' here would strip every space)
        { '§', "" },
        { '¡', "" },
        { '³', "" },
        { '\u00AD', "" }, // soft hyphen
        { 'º', "" },

        // Typographic punctuation is mapped to the closest ASCII equivalent.
        { '«', "-" },
        { '»', "-" },
        { '´', "'" },
        { '`', "'" },
        { '"', "'" },
        { '\u2013', "-" }, // en dash
        { '\u2014', "-" }, // em dash
        { '\u2019', "'" }, // right single quotation mark
        { '\u2026', "." }  // horizontal ellipsis
    };

    // Characters that must never be auto-registered for removal. The set is only
    // consulted for non-ASCII characters, so the ASCII entries below document intent
    // but have no effect today.
    private readonly HashSet<char> keep = new HashSet<char> { '-', '=', '\'', '.' };

    // Unmapped non-ASCII characters that have already been reported on the console.
    private readonly HashSet<char> seen = new HashSet<char>();

    /// <summary>
    /// Tells whether <paramref name="c"/> has an entry in the substitution table.
    /// </summary>
    /// <param name="c">The UTF-16 code unit to classify.</param>
    /// <returns>
    /// <c>true</c> when a replacement (possibly empty) is registered for the character,
    /// including one registered by this very call; otherwise <c>false</c>.
    /// </returns>
    /// <remarks>
    /// Side effect: an unmapped non-ASCII character is written to the console once,
    /// together with its UTF-8 bytes, and is added to the table with an empty
    /// replacement unless it is listed in <c>keep</c>.
    /// </remarks>
    public bool IsInternationalCharacter( char c )
    {
        if( internationals.ContainsKey( c ) )
        {
            return true;
        }

        // A character needs more than one UTF-8 byte exactly when it lies outside ASCII.
        if( c <= '\u007F' )
        {
            return false;
        }

        // HashSet.Add returns false for a character that was reported before; such a
        // character can only still be unmapped because it is in the keep set.
        if( !seen.Add( c ) )
        {
            return false;
        }

        byte[] bytes = Encoding.UTF8.GetBytes( c.ToString() );
        Console.WriteLine( "X '{0}' ({1})", c, string.Join( ",", bytes ) );

        if( keep.Contains( c ) )
        {
            return false;
        }

        internationals[ c ] = "";
        return true;
    }

    /// <summary>
    /// Builds an ASCII-safe version of <paramref name="name"/> and reports whether it
    /// differs from the original.
    /// </summary>
    /// <param name="name">The file name (without directory) to examine.</param>
    /// <param name="safeName">
    /// Receives the name with mapped characters substituted, runs of spaces collapsed,
    /// surrounding whitespace trimmed and a dot directly before the extension removed.
    /// </param>
    /// <returns><c>true</c> when <paramref name="safeName"/> differs from <paramref name="name"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="name"/> is <c>null</c>.</exception>
    public bool HasInternationalCharactersInName( string name, out string safeName )
    {
        if( name == null )
        {
            throw new ArgumentNullException( "name" );
        }

        // Compose first so that a decomposed (NFD) name, such as one read back from an
        // HFS+ volume, yields the same safe name as its precomposed (NFC) twin.
        string composed = Compose( name );

        StringBuilder mapped = new StringBuilder( composed.Length );
        foreach( char c in composed )
        {
            if( IsInternationalCharacter( c ) )
            {
                mapped.Append( internationals[ c ] );
            }
            else
            {
                mapped.Append( c );
            }
        }

        safeName = CollapseSpaces( mapped.ToString() ).Trim();

        // A substitution such as ellipsis -> "." can leave "name..ext"; drop the extra dot.
        string namePart = Path.GetFileNameWithoutExtension( safeName );
        if( namePart.EndsWith( ".", StringComparison.Ordinal ) )
        {
            safeName = namePart.Substring( 0, namePart.Length - 1 ) + Path.GetExtension( safeName );
        }

        return name != safeName;
    }

    // Returns the NFC form of the text, or the text unchanged when it contains invalid
    // Unicode (e.g. a lone surrogate), which string.Normalize rejects.
    private static string Compose( string text )
    {
        try
        {
            return text.Normalize( NormalizationForm.FormC );
        }
        catch( ArgumentException )
        {
            return text;
        }
    }

    // Replaces every run of consecutive space characters with a single space.
    private static string CollapseSpaces( string text )
    {
        StringBuilder result = new StringBuilder( text.Length );
        bool previousWasSpace = false;
        foreach( char c in text )
        {
            bool isSpace = c == ' ';
            if( !( isSpace && previousWasSpace ) )
            {
                result.Append( c );
            }
            previousWasSpace = isSpace;
        }
        return result.ToString();
    }
}
And this would be invoked like this:
// Example: check a single file and obtain its ASCII-safe name.
Checker checker = new Checker();

// System.IO.File is a static class and cannot be instantiated; FileInfo is the
// type that describes one file and exposes its Name.
FileInfo file = new FileInfo( "Århus.txt" );
string safeName;
if( checker.HasInternationalCharactersInName( file.Name, out safeName ) )
{
    // rename file
}