Java – processing extremely long string values (> 1 GB) in JSON with the Jackson token stream
I'm trying to write some code to process JSON documents that contain extremely long string values (more than 1 billion characters) stored in files. I don't want to keep the whole string in memory (because I can process it as a stream), but I can't find such an option in the Jackson parser. What I have done so far is to use Jackson token offsets (a first pass over the file) together with a RandomAccessFile to process the strings as a stream (a second pass over the file):
import java.io.ByteArrayOutputStream; import java.io.File; import java.io.OutputStream; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.MappingJsonFactory; public class LongStringJsonTest { public static void main(String[] args) throws Exception { File tempJson = new File("temp.json"); PrintWriter pw = new PrintWriter(tempJson); pw.print("{\"k1\": {\"k11\": \""); for (int i = 0; i < 1e8; i++) pw.print("abcdefghij"); pw.print("\"},\"k2\": \"klmnopqrst\"," + "\"k3\": [\"uvwxyz\",\"0123\"]}"); pw.close(); searchForStrings(tempJson); } private static void searchForStrings(File tempJson) throws Exception { JsonFactory f = new MappingJsonFactory(); JsonParser jp = f.createParser(tempJson); Map<Long,Long> stringStartToNext = new HashMap<Long,Long>(); long lastStringStart = -1; boolean wasFieldBeforeString = false; while (true) { JsonToken token = jp.nextToken(); if (token == null) break; if (lastStringStart >= 0) { stringStartToNext.put(lastStringStart,(wasFieldBeforeString ? -1 : 1) * jp.getTokenLocation().getByteOffset()); lastStringStart = -1; wasFieldBeforeString = false; } if (token == JsonToken.FIELD_NAME) { wasFieldBeforeString = true; } else if (token == JsonToken.VALUE_STRING) { lastStringStart = jp.getTokenLocation().getByteOffset(); } else { wasFieldBeforeString = false; } } jp.close(); jp = f.createParser(tempJson); RandomAccessFile raf = new RandomAccessFile(tempJson,"r"); while (true) { JsonToken token = jp.nextToken(); if (token == null) break; if (token == JsonToken.VALUE_STRING) { long start = jp.getTokenLocation().getByteOffset(); long end = stringStartToNext.get(start); // You are able to process stream without keeping all bytes in memory. 
// Here you see strings including quotes around them. final long[] length = new long[] {0}; ByteArrayOutputStream baos = new ByteArrayOutputStream(); OutputStream os = new OutputStream() { @Override public void write(int b) throws IOException { throw new IOException("Method is not supported"); } @Override public void write(byte[] b,int off,int len) throws IOException { if (baos.size() < 20) { baos.write(b,off,Math.min(len,20)); baos.write((int)'.'); baos.write((int)'.'); baos.write((int)'.'); } if (len > 0) length[0] += len; } }; processString(raf,start,end,os); String text = new String(baos.toByteArray(),Charset.forName("utf-8")); System.out.println("String: " + text + ",length=" + length[0]); } } jp.close(); raf.close(); } private static void processString(RandomAccessFile raf,long start,long end,OutputStream os) throws Exception { boolean wasFieldBeforeString = end < 0; int quoteNum = wasFieldBeforeString ? 3 : 1; end = Math.abs(end); byte[] buffer = new byte[10000]; raf.seek(start); boolean afterBackSlash = false; int strLen = (int)(end - start); for (int chunk = 0; strLen > 0; chunk++) { int ret = raf.read(buffer,Math.min(buffer.length,strLen)); if (ret < 0) break; if (ret > 0) { int offset = 0; if (chunk == 0) { // Assumption that key string doesn't contain double quotes // and it's shorter than buffer size (for simplicity) for (int n = 0; n < quoteNum; n++) { while (true) { if (buffer[offset] == '\"' && !afterBackSlash) { break; } else if (buffer[offset] == '\\') { afterBackSlash = !afterBackSlash; } else { afterBackSlash = false; } offset++; } offset++; } offset--; ret -= offset; } // Searching for ending quote int endQuotePos = offset + (chunk == 0 ? 
1 : 0); // Skip open quote while (endQuotePos < offset + ret) { if (buffer[endQuotePos] == '\"' && !afterBackSlash) { break; } else if (buffer[endQuotePos] == '\\') { afterBackSlash = !afterBackSlash; } else { afterBackSlash = false; } endQuotePos++; } if (endQuotePos < offset + ret) { os.write(buffer,offset,endQuotePos + 1 - offset); break; } os.write(buffer,ret); strLen -= ret; } } } }
This approach does not support Unicode at all. I wonder if there is a better way to do this (perhaps with the help of some other library)?
Solution
I think you're asking the wrong question
JSON, like XML, CSV, or any other structured text representation, has three main roles: making data structures human-readable, allowing common tools to process many different types of data, and facilitating data exchange between different systems.
If you do not need these specific features, structured text may be the wrong solution. Dedicated binary representations can be more efficient, and the difference can become significant as the size and complexity of the data increase.
Support structured text formats in your import and export tools. Internally, however, you might want to use a data model tailored to the specific needs of your task.