Converting XML files to CSV in Java
First of all, there may already be duplicate questions and suggestions on this topic; I doubt this is the first time it has been asked. I will be as brief as possible. The title gives the basic idea.
The following is an XML example (case 1):
<root> <Item> <ItemID>4504216603</ItemID> <ListingDetails> <StartTime>10:00:10.000Z</StartTime> <EndTime>10:00:30.000Z</EndTime> <ViewItemURL>http://url</ViewItemURL> .... </item>
Here is a sample XML (case 2):
<Item> <ItemID>4504216604</ItemID> <ListingDetails> <StartTime>10:30:10.000Z</StartTime> <!-- Start difference from case 1 --> <averages> <AverageTime>value1</AverageTime> <category type="TX">9823</category> <category type="TY">9112</category> <AveragePrice>value2</AveragePrice> </averages> <!-- End difference from case 1 --> <EndTime>11:00:10.000Z</EndTime> <ViewItemURL>http://url</ViewItemURL> .... </item> </root>
I borrowed this XML from Google. Anyway, my objects are not always the same: sometimes there are additional elements, as in case 2. Now I want to generate a CSV like this from the two cases:
ItemID,StartTime,EndTime,ViewItemURL,AverageTime,AveragePrice
4504216603,10:00:10.000Z,10:00:30.000Z,http://url
4504216604,10:30:10.000Z,11:00:10.000Z,http://url,value1,value2
The first line is the header row, which should also be included in the CSV. I found some useful links today, but I really don't know what the right/best approach is. I have been working on this for 3 days now and I'm not willing to give up.
Tell me how you would solve this problem.
I forgot to mention that this is a very large XML file, up to 1 GB.
Bounty update:
I am looking for a more general approach, meaning that it should work for any number of nodes at any depth. Sometimes, as in the example XML, one Item object has more nodes than the next/previous one; that case must be handled too, so that all columns and values still line up in the CSV.
It is also possible for nodes to have the same name/localName but different values and attributes. In that case a new column should appear in the CSV with the appropriate value (I added an example of this: the `category` elements inside the `<averages>` tag).
Solution
The code provided should be considered a sketch rather than a finished article. I am not an expert in SAX, and the implementation could be improved for better performance, simpler code, etc. That said, SAX should be able to cope with streaming large XML files.
I would approach this problem with two passes using the SAX parser. (Incidentally, I would also use a CSV generation library to create the output, because it would deal with all the fiddly character escaping that CSV involves, but I haven't implemented that in the sketch.)
First pass: establish the number of header columns
Second pass: output CSV
I assume that the XML file is well formed. I also assume that we don't have a schema/DTD with a predefined order.
In the first pass, I assume that a CSV column will be added for each XML element that contains text content and for each attribute (I assume that the attribute will contain something!).
Once the number of target columns has been established, the second pass produces the actual CSV output.
Based on your sample XML, my code sketch will produce:
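ItemID,StartTime,EndTime,ViewItemURL,AverageTime,category,category,type,type,AveragePrice
4504216603,10:00:10.000Z,10:00:30.000Z,http://url,,,,,,
4504216604,10:30:10.000Z,11:00:10.000Z,http://url,value1,9823,9112,TX,TY,value2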
Note that I've used the Google Collections `LinkedHashMultimap`, because it is useful when associating multiple values with a single key. I hope you find it useful!
import com.google.common.collect.LinkedHashMultimap; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.LinkedHashMap; import java.util.Map.Entry; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; public class App { public static void main(String[] args) throws SAXException,FileNotFoundException,IOException { // First pass - to determine headers XMLReader xr = XMLReaderFactory.createXMLReader(); HeaderHandler handler = new HeaderHandler(); xr.setContentHandler(handler); xr.setErrorHandler(handler); FileReader r = new FileReader("test1.xml"); xr.parse(new InputSource(r)); LinkedHashMap<String,Integer> headers = handler.getHeaders(); int totalnumberofcolumns = 0; for (int headercount : headers.values()) { totalnumberofcolumns += headercount; } String[] columnheaders = new String[totalnumberofcolumns]; int i = 0; for (Entry<String,Integer> entry : headers.entrySet()) { for (int j = 0; j < entry.getValue(); j++) { columnheaders[i] = entry.getKey(); i++; } } StringBuilder sb = new StringBuilder(); for (String h : columnheaders) { sb.append(h); sb.append(','); } System.out.println(sb.substring(0,sb.length() - 1)); // Second pass - collect and output data xr = XMLReaderFactory.createXMLReader(); DataHandler datahandler = new DataHandler(); datahandler.setHeaderArray(columnheaders); xr.setContentHandler(datahandler); xr.setErrorHandler(datahandler); r = new FileReader("test1.xml"); xr.parse(new InputSource(r)); } public static class HeaderHandler extends DefaultHandler { private String content; private String currentElement; private boolean insideElement = false; private Attributes attribs; private LinkedHashMap<String,Integer> itemHeader; private LinkedHashMap<String,Integer> accumulativeHeader = new LinkedHashMap<String,Integer>(); public 
HeaderHandler() { super(); } private LinkedHashMap<String,Integer> getHeaders() { return accumulativeHeader; } private void addItemHeader(String headerName) { if (itemHeader.containsKey(headerName)) { itemHeader.put(headerName,itemHeader.get(headerName) + 1); } else { itemHeader.put(headerName,1); } } @Override public void startElement(String uri,String name,String qName,Attributes atts) { if ("item".equalsIgnoreCase(qName)) { itemHeader = new LinkedHashMap<String,Integer>(); } currentElement = qName; content = null; insideElement = true; attribs = atts; } @Override public void endElement(String uri,String qName) { if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) { if (content != null && qName.equals(currentElement) && content.trim().length() > 0) { addItemHeader(qName); } if (attribs != null) { int attsLength = attribs.getLength(); if (attsLength > 0) { for (int i = 0; i < attsLength; i++) { String attName = attribs.getLocalName(i); addItemHeader(attName); } } } } if ("item".equalsIgnoreCase(qName)) { for (Entry<String,Integer> entry : itemHeader.entrySet()) { String headerName = entry.getKey(); Integer count = entry.getValue(); //System.out.println(entry.getKey() + ":" + entry.getValue()); if (accumulativeHeader.containsKey(headerName)) { if (count > accumulativeHeader.get(headerName)) { accumulativeHeader.put(headerName,count); } } else { accumulativeHeader.put(headerName,count); } } } insideElement = false; currentElement = null; attribs = null; } @Override public void characters(char ch[],int start,int length) { if (insideElement) { content = new String(ch,start,length); } } } public static class DataHandler extends DefaultHandler { private String content; private String currentElement; private boolean insideElement = false; private Attributes attribs; private LinkedHashMultimap dataMap; private String[] headerArray; public DataHandler() { super(); } @Override public void startElement(String uri,Attributes atts) { if 
("item".equalsIgnoreCase(qName)) { dataMap = LinkedHashMultimap.create(); } currentElement = qName; content = null; insideElement = true; attribs = atts; } @Override public void endElement(String uri,String qName) { if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) { if (content != null && qName.equals(currentElement) && content.trim().length() > 0) { dataMap.put(qName,content); } if (attribs != null) { int attsLength = attribs.getLength(); if (attsLength > 0) { for (int i = 0; i < attsLength; i++) { String attName = attribs.getLocalName(i); dataMap.put(attName,attribs.getValue(i)); } } } } if ("item".equalsIgnoreCase(qName)) { String data[] = new String[headerArray.length]; int i = 0; for (String h : headerArray) { if (dataMap.containsKey(h)) { Object[] values = dataMap.get(h).toArray(); data[i] = (String) values[0]; if (values.length > 1) { dataMap.removeAll(h); for (int j = 1; j < values.length; j++) { dataMap.put(h,values[j]); } } else { dataMap.removeAll(h); } } else { data[i] = ""; } i++; } StringBuilder sb = new StringBuilder(); for (String d : data) { sb.append(d); sb.append(','); } System.out.println(sb.substring(0,sb.length() - 1)); } insideElement = false; currentElement = null; attribs = null; } @Override public void characters(char ch[],length); } } public void setHeaderArray(String[] headerArray) { this.headerArray = headerArray; } } }