I am trying to use Java to convert a .txt file to .xml format.
The program runs and produces an XML file, but the output does not contain all of the tags I want.
This is a sample of what my input text file looks like:
<DOC> <DOCNO>3393</DOCNO> <TEXT> Biblical Traditions </TEXT> </DOC> <DOC> <DOCNO>42027</DOCNO> <TEXT> Automobiles </TEXT> </DOC> <DOC> <DOCNO>7456</DOCNO> <TEXT> Fruits and Vegetables </TEXT> </DOC>
When I run my current code, the generated XML file looks like this:
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <DOC> <DOC> <TEXT></TEXT> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT> Biblical Traditions </TEXT> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT> Automobiles </TEXT> </DOC> <DOC> <TEXT/> </DOC> <DOC> <TEXT/> </DOC>
It creates redundant <DOC> and <TEXT> tags, most of them empty, and the <DOCNO> tag is missing entirely. Here is my current code. I need help modifying it to include the <DOCNO> tag:
package convert; import java.io.BufferedReader; import java.io.FileReader; import javax.xml.*; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.*; public class convertToXML { BufferedReader in; StreamResult out; Document xmldoc; Element root; public static void main(String[] args) { new convertToXML().doit(); } public void doit() { try { in = new BufferedReader(new FileReader("C:\\text.txt")); out = new StreamResult("C:\\newXML.xml"); initXML(); String str; while ((str = in.readLine()) != null) { process(str); } in.close(); writeXML(); } catch (Exception e) { e.printStackTrace(); } } public void initXML() throws ParserConfigurationException { //JAXP + DOM DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); DOMImplementation impl = builder.getDOMImplementation(); xmldoc = impl.createDocument(null, "DOC", null); root = xmldoc.getDocumentElement(); } public void process(String s) { //Escape character String [] elements = s.split("\\<"); Element e0 = xmldoc.createElement("DOC"); Element e1 = xmldoc.createElement("TEXT"); Node n1 = xmldoc.createTextNode(elements[0]); e1.appendChild(n1); //Element e2 = xmldoc.createElement("TEXT"); //Node n2 = xmldoc.createTextNode(elements[1]); //e2.appendChild(n2); e0.appendChild(e1); //e0.appendChild(e2); root.appendChild(e0); } public String writeXML() throws TransformerConfigurationException, TransformerException { DOMSource domSource = new DOMSource(xmldoc); TransformerFactory tf = TransformerFactory.newInstance(); 
Transformer transformer = tf.newTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, "xml"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.transform(domSource, out); java.io.StringWriter sw = new java.io.StringWriter(); StreamResult sr = new StreamResult(sw); transformer.transform(domSource, sr); return sw.toString(); } }