import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
//Example :
public class ParseFile
{
void parseDoc(final String resourceLocation) throws IOException,
SAXException, TikaException {
InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser();
parser.parse(input, textHandler, metadata);
input.close();
out.println(“Tika Parser starts……\n”);
out.println(“file name: “+resourceLocation);
out.println(“Title: ” + metadata.get(“title”));
out.println(“Author: ” + metadata.get(“Author”));
out.println(“content: ” + textHandler.toString());
out.println(“Tika Parser stops……”);
}
}
/**
 * Command-line entry point: reads a file path from standard input and
 * hands it to {@code ParseFile.parseDoc}.
 */
public class AutoParse
{
    /**
     * Reads one line from standard input, treats it as the path of the
     * file to parse, and prints the parse result via {@code ParseFile}.
     *
     * @param args command-line arguments (not used)
     * @throws IllegalStateException if no line is available on standard
     *         input, or if parsing fails (the original exception is kept
     *         as the cause)
     */
    public static void main(String[] args)
    {
        // Scanner has no no-arg constructor; it must be given a source.
        Scanner sc = new Scanner(System.in);
        if (!sc.hasNextLine()) {
            throw new IllegalStateException(
                    "No file path supplied on standard input");
        }
        final String resourceLocation = sc.nextLine();

        ParseFile pf = new ParseFile();
        try {
            pf.parseDoc(resourceLocation);
        } catch (IOException | SAXException | TikaException e) {
            // parseDoc declares checked exceptions; wrap them so main keeps
            // its plain signature while preserving the cause.
            throw new IllegalStateException(
                    "Failed to parse " + resourceLocation, e);
        }
    }
}
/* Using this example, we can parse any compatible file by calling parseDoc() with the file name (including its location) as the parameter.
In the above example, I first create a FileInputStream containing the document to parse. Then I use a Tika content handler called BodyContentHandler that internally constructs a content handler decorator of type XHTML to TextContextHandler. The decorator actually forms the plain text output from the SAX events that the Parser emits. Next I instantiate an AutoDetectParser directly, call the parse method and close the stream. It is required to call the close method of the InputStream, since it is not the responsibility of the Parser to call it for the user.
Tika provides some ready-made ContentHandler implementations that can be useful while parsing content with Tika.
Finally, the metadata (input/output) parameter provides additional data to the parser as input and can return additional metadata out from the document. Examples of metadata include things like author name, number of pages, creation date, etc.*/
/* To parse an image file:
ImageParser parser = new ImageParser();
To parse a PDF file:
PDFParser parser = new PDFParser();
*/
/*AutoDetectParser parser = new AutoDetectParser();
Some uses of the Tika API
Apache Tika- A Content Extraction Framework | amitbariar
*/