package com.directoryfiles;
import java.sql.SQLOutput;
import java.util.Scanner;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.image.ImageMetadataExtractor;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.commons.io.*;
import java.io.*;
/**
 * Extracts body text and metadata from files using Apache Tika.
 *
 * <p>{@code parseDoc} runs Tika's {@code AutoDetectParser} over a file and keeps the resulting
 * text handler and metadata in this object's fields; {@code imageExtract} reads JPEG metadata
 * through {@code ImageMetadataExtractor}. Both methods report failures with
 * {@code printStackTrace()} instead of propagating them to the caller.
 *
 * <p>The fields are reassigned on every call, so an instance always reflects the most recent
 * successful operation.
 *
 * <p>Created by tr01 on 12/27/13.
 */
public class ExtractText {
    /** Handler that collects the body text during {@code parseDoc}. */
    ContentHandler textHandler;
    /** Metadata produced by the most recent successful {@code parseDoc} or {@code imageExtract}. */
    Metadata metadata;
    /** Parser created by {@code parseDoc} for non-image files. */
    AutoDetectParser parser;
    /** File most recently passed to {@code parseDoc}. */
    File fileName;
    /** Path string most recently passed to {@code parseDoc}. */
    String resourceLocation;
    /** Not assigned by this class; kept because the field is package-visible. */
    JpegParser jpegParserObj;
    /** Not assigned by this class; kept because the field is package-visible. */
    ParseContext context;
    /**
     * Not assigned by this class (streams are opened and closed locally); kept because the field
     * is package-visible.
     */
    InputStream input;
    /** Tika facade created by the most recent {@code parseDoc} or {@code imageExtract} call. */
    Tika tikaObj;
    /** Extractor created by the most recent {@code imageExtract} call. */
    ImageMetadataExtractor imgObj;

    /**
     * Parses the file at {@code resourceLocation} and stores the extracted text and metadata in
     * the {@code textHandler} and {@code metadata} fields, then calls {@link #createAllFiles()}.
     *
     * <p>Files whose names end in {@code .jpeg}, {@code .png} or {@code .jpg} are opened but not
     * parsed, so for them the handler and metadata are fresh and empty. The suffix checks are
     * case-sensitive. Any exception is printed with {@code printStackTrace()} and not rethrown.
     *
     * @param resourceLocation path of the file to parse
     */
    void parseDoc(String resourceLocation) {
        this.resourceLocation = resourceLocation;
        try {
            fileName = new File(resourceLocation);
            tikaObj = new Tika();
            String name = fileName.getName();
            // try-with-resources closes the stream even when the parser throws.
            try (InputStream stream = new FileInputStream(fileName)) {
                textHandler = new BodyContentHandler(writeLimitFor(name, fileName.length()));
                metadata = new Metadata();
                if (!isImageName(name)) {
                    parser = new AutoDetectParser();
                    parser.parse(stream, textHandler, metadata);
                }
            }
            createAllFiles();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Detects the content type of the given file with Tika and runs
     * {@code ImageMetadataExtractor.parseJpeg} on it.
     *
     * <p>On success the collected metadata (including the detected content type) is stored in
     * the {@code metadata} field so that it can be read afterwards, e.g. by
     * {@link #displayMetaDataContent()}. On failure the exception is printed with
     * {@code printStackTrace()} and the {@code metadata} field is left untouched.
     *
     * @param fileName path of the image file to inspect
     */
    public void imageExtract(String fileName) {
        try {
            tikaObj = new Tika();
            File imageFile = new File(fileName);
            Metadata imageMetadata = new Metadata();
            // The stream is only needed for type detection; close it right afterwards.
            try (InputStream stream = new FileInputStream(imageFile)) {
                imageMetadata.set(Metadata.CONTENT_TYPE, tikaObj.detect(stream));
            }
            imgObj = new ImageMetadataExtractor(imageMetadata);
            imgObj.parseJpeg(imageFile);
            // Publish only after a successful parse so a failure does not replace earlier results.
            metadata = imageMetadata;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the last parsed location, the {@code title} and {@code Author} metadata values and
     * the extracted text to standard output.
     *
     * <p>Values that are not available yet (nothing has been parsed) are printed as
     * {@code null}.
     */
    public void displayMetaDataContent() {
        String title = (metadata == null) ? null : metadata.get("title");
        String author = (metadata == null) ? null : metadata.get("Author");
        System.out.println("Tika Parser starts……\n");
        System.out.println("file name: " + resourceLocation);
        System.out.println("Title:" + title);
        System.out.println("Author: " + author);
        // String concatenation calls toString() on the handler and tolerates null.
        System.out.println("content: " + textHandler);
        System.out.println("Tika Parser stops……");
    }

    /**
     * Currently does nothing; {@code parseDoc} calls it after every successful parse.
     *
     * <p>The disabled implementation wrote the extracted text next to the source file:
     * {@code FileUtils.writeStringToFile(new File(fileName.getPath() + ".txt"),
     * textHandler.toString())}.
     */
    public void createAllFiles() {
        // Intentionally empty, see the method comment.
    }

    /**
     * Computes the character limit handed to {@code BodyContentHandler}: ten times the file
     * length for {@code .zip} names, otherwise the file length plus one.
     *
     * <p>The arithmetic is done in {@code long} and clamped to {@code Integer.MAX_VALUE} so that
     * large files cannot overflow into a negative or truncated limit.
     * NOTE(review): text extracted from compressed formats may still exceed a limit derived from
     * the file size; confirm whether the limit should be disabled instead.
     */
    private static int writeLimitFor(String name, long length) {
        long limit = name.endsWith(".zip") ? length * 10 : length + 1;
        return (int) Math.min(limit, (long) Integer.MAX_VALUE);
    }

    /** Returns whether the name ends in one of the image suffixes that {@code parseDoc} skips. */
    private static boolean isImageName(String name) {
        return name.endsWith(".jpeg") || name.endsWith(".png") || name.endsWith(".jpg");
    }
}