using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;

namespace DF
{
    /// <summary>
    /// Guesses the text encoding of a file from a byte order mark, a
    /// "coding: name" comment in the first lines, or by checking whether the
    /// content is well-formed UTF-8.
    /// </summary>
    class FileEncodingAutoDetection
    {
        // Matches a "#" or "//" comment line containing "coding:" or "coding=" followed by an encoding name.
        private static readonly Regex coding_pep_re = new Regex(@"^(?:#|//).*coding[:=]\s*([-\.\w]+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Matches a bare code page number, optionally prefixed with "cp" or "cp-"; group 1 holds the digits.
        private static readonly Regex coding_pep_cp_re = new Regex(@"^(?:cp-?)?(\d+)$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // Shared so the delimiter array is not allocated on every call.
        private static readonly char[] line_delimeters = new char[] {'\r', '\n'};

        /// <summary>
        /// Returns true when the two characters form a two-character line break
        /// ("\r\n" or "\n\r").
        /// </summary>
        private static bool IsLineBreakPair(char first, char second)
        {
            return (first == '\r' && second == '\n') || (first == '\n' && second == '\r');
        }

        /// <summary>
        /// Scans the stream from its current position, collecting up to
        /// <paramref name="maxlines"/> leading lines (decoded as ASCII) and
        /// checking whether the bytes form valid UTF-8. The stream position is
        /// restored before returning.
        /// </summary>
        /// <param name="fs">Stream to examine; must not be null.</param>
        /// <param name="lines">Receives the leading lines as strings; may be null, in which case no lines are collected.</param>
        /// <param name="maxlines">Maximum number of lines to collect; must not be negative.</param>
        /// <param name="defaultEncoding">Encoding returned when the content is not valid UTF-8; null means <see cref="Encoding.Default"/>.</param>
        /// <returns><see cref="Encoding.UTF8"/> when the scanned bytes are valid UTF-8 (plain ASCII included), otherwise the default encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxlines"/> is negative.</exception>
        public static Encoding DetectEncoding(FileStream fs, IList lines, int maxlines, Encoding defaultEncoding)
        {
            if (fs == null)
                throw new ArgumentNullException("fs");
            if (lines == null)
                maxlines = 0;
            if (maxlines < 0)
                throw new ArgumentOutOfRangeException("maxlines", "maxlines must not be negative");
            if (defaultEncoding == null)
                defaultEncoding = Encoding.Default;

            Encoding ascii = Encoding.ASCII;
            long saved_position = fs.Position;
            int len;
            int utf8len = 0;          // continuation bytes still expected for the current sequence
            int foundlines = 0;
            char pending = '\0';      // delimiter that ended the previous buffer, if any
            bool have_valid_utf8 = true;
            byte[] bytes = new byte[4096];
            StringBuilder sb = null;
            if (maxlines > 0)
                sb = new StringBuilder();

            while ((len = fs.Read(bytes, 0, bytes.Length)) != 0)
            {
                // Collect the first needed lines.
                if (foundlines < maxlines)
                {
                    string chars = ascii.GetString(bytes, 0, len);
                    int linestart = 0;

                    // A two-character line break may have been split across two
                    // buffers; swallow its second half. The carry-over is only
                    // meaningful for the buffer that immediately follows.
                    if (pending != '\0')
                    {
                        if (IsLineBreakPair(pending, chars[0]))
                            linestart = 1;
                        pending = '\0';
                    }

                    while (foundlines < maxlines && linestart < chars.Length)
                    {
                        // Search from linestart so that each iteration advances.
                        int lineend = chars.IndexOfAny(line_delimeters, linestart);
                        if (lineend == -1)
                            break;

                        sb.Append(chars, linestart, lineend - linestart);
                        lines.Add(sb.ToString());
                        sb.Length = 0;
                        ++foundlines;

                        if (lineend == chars.Length - 1)
                        {
                            // The break may continue in the next buffer.
                            pending = chars[lineend];
                            linestart = chars.Length;
                            break;
                        }

                        if (IsLineBreakPair(chars[lineend], chars[lineend + 1]))
                            linestart = lineend + 2;
                        else
                            linestart = lineend + 1;
                    }

                    // Keep the unterminated tail; it is completed by a later buffer.
                    if (foundlines < maxlines && linestart < chars.Length)
                        sb.Append(chars, linestart, chars.Length - linestart);
                }

                // Check that the bytes are structurally valid UTF-8.
                for (int i = 0; have_valid_utf8 && i < len; ++i)
                {
                    byte b = bytes[i];
                    if (b >= 0x80)
                    {
                        if (b < 0xc0)
                        {
                            // Continuation byte: only legal inside a sequence.
                            if (utf8len > 0)
                                utf8len--;
                            else
                                have_valid_utf8 = false;
                        }
                        else if (b >= 0xc2 && b < 0xf5)
                        {
                            // Lead byte: not legal while a sequence is still open.
                            if (utf8len > 0)
                                have_valid_utf8 = false;
                            else if (b < 0xe0)
                                utf8len = 1;
                            else if (b < 0xf0)
                                utf8len = 2;
                            else
                                utf8len = 3;
                        }
                        else
                        {
                            have_valid_utf8 = false;
                        }
                    }
                    else if (utf8len > 0)
                    {
                        // ASCII byte in the middle of a multi-byte sequence.
                        have_valid_utf8 = false;
                    }
                }

                if (!have_valid_utf8)
                {
                    // The encoding verdict is final; keep reading only while lines are still needed.
                    if (foundlines < maxlines)
                        continue;
                    break;
                }
            }

            // The loop only ends with a valid verdict at end of data, so an open
            // sequence here means the data stops in the middle of a character.
            if (have_valid_utf8 && utf8len > 0)
                have_valid_utf8 = false;

            fs.Position = saved_position;

            // Report the final (unterminated) line if more lines were requested.
            if (foundlines < maxlines)
                lines.Add(sb.ToString());

            if (have_valid_utf8)
                return Encoding.UTF8;
            return defaultEncoding;
        }

        /// <summary>
        /// Opens a text file with an automatically selected encoding. A byte
        /// order mark wins; otherwise a "coding:" comment in the first two lines
        /// is honoured; otherwise the result of <see cref="DetectEncoding"/> is used.
        /// </summary>
        /// <param name="fname">Path of the file to open; must not be null.</param>
        /// <param name="defaultEncoding">Fallback encoding passed to <see cref="DetectEncoding"/>; may be null.</param>
        /// <returns>A reader positioned at the start of the file. Disposing it closes the file.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="fname"/> is null.</exception>
        public static StreamReader OpenText(string fname, Encoding defaultEncoding)
        {
            if (fname == null)
                throw new ArgumentNullException("fname");

            FileStream fs = File.OpenRead(fname);
            try
            {
                Encoding enc = Encoding.ASCII;
                StreamReader reader = new StreamReader(fs, enc);
                reader.Peek(); // force BOM detection
                if (reader.CurrentEncoding != enc)
                    return reader; // BOM detected

                // No BOM: drop the probe reader (without closing fs) and rescan from the start.
                reader.DiscardBufferedData();
                reader = null;
                fs.Position = 0;

                ArrayList lines = new ArrayList();
                enc = DetectEncoding(fs, lines, 2, defaultEncoding);

                foreach (string line in lines)
                {
                    Match m1 = coding_pep_re.Match(line);
                    if (!m1.Success)
                        continue;

                    string name = m1.Groups[1].Value;

                    // Names with special handling; compared case-insensitively
                    // because the cookie pattern itself ignores case.
                    if (String.Equals(name, "ascii", StringComparison.OrdinalIgnoreCase))
                    {
                        enc = Encoding.ASCII;
                        break;
                    }
                    if (String.Equals(name, "mbcs", StringComparison.OrdinalIgnoreCase))
                    {
                        enc = Encoding.Default;
                        break;
                    }

                    try
                    {
                        enc = Encoding.GetEncoding(name);
                        break;
                    }
                    catch (ArgumentException)
                    {
                        // Unknown name: fall through to the code page form.
                    }
                    catch (NotSupportedException)
                    {
                        // Not available on this platform: fall through.
                    }

                    Match m2 = coding_pep_cp_re.Match(name);
                    int codepage;
                    if (m2.Success && int.TryParse(m2.Groups[1].Value, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out codepage))
                    {
                        try
                        {
                            enc = Encoding.GetEncoding(codepage);
                            break;
                        }
                        catch (ArgumentException)
                        {
                            // Invalid code page number: keep the detected encoding.
                        }
                        catch (NotSupportedException)
                        {
                            // Code page not available: keep the detected encoding.
                        }
                    }
                    // An unrecognised encoding name is deliberately ignored;
                    // the next line (if any) is examined instead.
                }

                reader = new StreamReader(fs, enc, false);
                return reader;
            }
            catch (Exception)
            {
                // The caller never receives a reader, so the stream must be closed here.
                fs.Close();
                throw;
            }
        }

        /// <summary>
        /// Opens a text file with an automatically selected encoding, using
        /// <see cref="Encoding.Default"/> as the fallback.
        /// </summary>
        /// <param name="fname">Path of the file to open; must not be null.</param>
        public static StreamReader OpenText(string fname)
        {
            return OpenText(fname, null);
        }
    }

    /// <summary>
    /// Small manual check: prints a greeting and echoes a source file opened
    /// through <see cref="FileEncodingAutoDetection.OpenText(string)"/>.
    /// </summary>
    class TestCase
    {
        public static void Main()
        {
            Console.WriteLine("Привет вам из си-шарпа!");
            using (StreamReader reader = FileEncodingAutoDetection.OpenText("FileEncodingAutoDetection2.cs"))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                    Console.WriteLine(line);
            }
        }
    }
}