View Single Post
  #6   Spotlight this post!  
Unread 14-03-2011, 13:03
buchanan buchanan is online now
Registered User
FRC #2077 (Laser Robotics)
Team Role: Mentor
 
Join Date: Mar 2009
Rookie Year: 2007
Location: Wales, WI
Posts: 67
buchanan is just really nicebuchanan is just really nicebuchanan is just really nicebuchanan is just really nice
Re: Reading characters from socket weirdness

In Java a char is a 16-bit value assumed to mean a Unicode character (strictly speaking a UTF-16 code unit, which for the characters discussed here is the same as the Unicode "code point"). This is an internal representation, and any time you do I/O through encoding-aware classes (InputStreamReader, OutputStreamWriter and the other Reader/Writer classes built on them) it gets converted to or from an external representation, i.e. a sequence of bytes. The encoding used for that conversion can be either explicitly specified or taken from the platform's default. If the reader doesn't use the same encoding as the writer, mismatches occur and the reader doesn't get out the same internal "char" values the writer put in. Both UTF-8 and forms of LATIN-1/ISO-8859-1 are in common use as defaults, so relying on defaults is dangerous when passing data between dissimilar machines. What's insidious is that these two encodings, though strictly speaking incompatible, actually do map 0-127 the same way, so programs only passing code points in this range appear to work, even if they're mismatched.

Below is some code you can play with to observe the various interactions, but the takeaways are 1) Don't use encoding-aware APIs unless what you're passing really is text data, and 2) If you are passing encoded text between different platforms, specify the encoding explicitly.
Code:
/////////////////////////
import java.net.*;
import java.io.*;
import java.nio.charset.*;

/**
 * Demo server for observing character-encoding effects on a socket.
 *
 * Listens on an OS-assigned port, accepts a single connection, reads one line of
 * text through an encoding-aware reader and prints every resulting char as its
 * integer value.
 */
public class Reader
{
	/**
	 * Reads a character sequence presumed to be in the platform's default character encoding.
	 *
	 * @param argv unused
	 */
	public static void main(String[] argv)
	{
		try {
			System.out.println(Charset.defaultCharset());
			// run w/ -Dfile.encoding=UTF-8 or -Dfile.encoding=ISO-8859-1 on the command line to change the above

			ServerSocket ss = new ServerSocket(0); // port 0: a free port is allocated automatically
			try {
				System.out.println(ss.getLocalPort()); // the listening port; pass it to Writer in argv[0]
				Socket s = ss.accept();
				try {
					System.out.println(s.getPort()); // the connecting peer's port

					BufferedReader reader = new BufferedReader(new InputStreamReader(s.getInputStream())); // uses Charset.defaultCharset()
					//BufferedReader reader = new BufferedReader(new InputStreamReader(s.getInputStream(), "ISO-8859-1")); // explicitly specifies encoding

					// readLine() returns null if the stream ends before any line is read
					// (e.g. the peer connected and closed without sending anything)
					String line = reader.readLine();
					if (line == null) {
						System.err.println("connection closed before any data was received");
						return;
					}

					char[] data = line.toCharArray(); // convert the incoming encoded sequence assuming it's in "our" encoding
					// if our encoding matched the writer's (whatever it was) all is well
					// if there's a mismatch, we get various kinds of garbage, depending on who used what
					// for UTF-8/ISO-8859-1 mismatches, the garbage only shows up in code points > 127, since their encodings happen to match for 0-127
					for (int i = 0; i < data.length; i++) {
						System.out.print((int)data[i] + " ");
					}
					System.out.println();
				}
				finally {
					s.close(); // also closes the socket's input stream
				}
			}
			finally {
				ss.close(); // release the listening port even if accept/read failed
			}
		}
		catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
/////////////////////////
/**
 * Demo client for observing character-encoding effects on a socket.
 *
 * Connects to a Reader (or RawReader) on the local host and sends a short char
 * sequence followed by a newline through an encoding-aware writer.
 */
public class Writer
{
	/**
	 * Writes a character sequence in the platform's default character encoding.
	 *
	 * @param argv argv[0] is the port number printed by the Reader
	 */
	public static void main(String[] argv) // supply the Reader port number in argv[0]
	{
		// without a port there is nothing to connect to; say so instead of dumping an
		// ArrayIndexOutOfBoundsException stack trace
		if (argv.length < 1) {
			System.err.println("usage: java Writer <reader-port>");
			return;
		}

		try {
			System.out.println(Charset.defaultCharset());
			// run w/ -Dfile.encoding=UTF-8 or -Dfile.encoding=ISO-8859-1 on the command line to change the above

			Socket s = new Socket(InetAddress.getLocalHost(), Integer.parseInt(argv[0]));
			try {
				System.out.println(s.getLocalPort());

				BufferedWriter out = new BufferedWriter(new OutputStreamWriter(s.getOutputStream())); // uses Charset.defaultCharset()
				//BufferedWriter out = new BufferedWriter(new OutputStreamWriter(s.getOutputStream(), "ISO-8859-1")); // explicitly specifies encoding
				char[] data = {1, 2, 3, 100, 150, 180}; // a "char" is a 16-bit unicode "code point"
				out.write(data); // the OutputStreamWriter encodes the chars in its charset
				// under UTF-8, the chars above go out as bytes 1 2 3 100 194 150 194 180
				// (4 single-byte values, then 2 two-byte UTF-8 sequences)
				// under ISO-8859-1 it's 1 2 3 100 150 180 (all single-byte values)
				out.write("\n"); // writes a 10 (in either encoding)
				out.flush(); // push the buffered chars through the encoder onto the socket
			}
			finally {
				s.close(); // always release the connection, even if a write failed
			}
		}
		catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
/////////////////////////
/**
 * Demo server that shows the raw bytes arriving on a socket.
 *
 * Listens on an OS-assigned port, accepts a single connection and prints every
 * received byte as an integer until the peer closes the connection.
 */
public class RawReader
{
	/**
	 * Reads a stream of bytes; nothing here is affected by the JVM's default encoding.
	 *
	 * @param argv unused
	 */
	public static void main(String[] argv)
	{
		try {
			System.out.println(Charset.defaultCharset());

			ServerSocket ss = new ServerSocket(0); // port 0: a free port is allocated automatically
			try {
				System.out.println(ss.getLocalPort()); // the listening port; pass it to Writer in argv[0]
				Socket s = ss.accept();
				try {
					System.out.println(s.getPort()); // the connecting peer's port

					InputStream in = s.getInputStream();
					// read() yields each byte as an int 0-255, or -1 once the peer has closed the stream
					for (int i = in.read(); i != -1; i = in.read()) {
						System.out.print(i + " ");
					}
					System.out.println();
				}
				finally {
					s.close(); // also closes the socket's input stream
				}
			}
			finally {
				ss.close(); // release the listening port even if accept/read failed
			}
		}
		catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
/////////////////////////
/**
 * Demo client that sends raw bytes over a socket.
 *
 * Connects to a Reader (or RawReader) on the local host, writes a fixed byte
 * sequence with no character encoding involved, and closes the connection.
 */
public class RawWriter
{
	/**
	 * Writes a stream of bytes; nothing here is affected by the JVM's default encoding.
	 *
	 * @param argv argv[0] is the port number printed by the Reader
	 */
	public static void main(String[] argv) // supply the Reader port number in argv[0]
	{
		// without a port there is nothing to connect to; say so instead of dumping an
		// ArrayIndexOutOfBoundsException stack trace
		if (argv.length < 1) {
			System.err.println("usage: java RawWriter <reader-port>");
			return;
		}

		try {
			System.out.println(Charset.defaultCharset());

			Socket s = new Socket(InetAddress.getLocalHost(), Integer.parseInt(argv[0]));
			try {
				System.out.println(s.getLocalPort());

				OutputStream out = s.getOutputStream();
				// a Java byte is signed (-128..127), hence the casts for 150 and 180;
				// the low 8 bits are what goes on the wire, so the receiver sees 0-255 values
				byte[] data = {(byte)1, (byte)2, (byte)3, (byte)100, (byte)150, (byte)180};
				out.write(data); // no encoding happens here
				//out.write((byte)10); // if we add the EOL (10) here we can duplicate the output of -Dfile.encoding=ISO-8859-1 Writer
			}
			finally {
				s.close(); // closes the output stream too, which is what signals end-of-stream to the reader
			}
		}
		catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
/////////////////////////
Reply With Quote