Hello, I wrote a program to extract table data from HTML document.
However I want to shrink this code more because it has to use large array variables.
Could you give me any advice for improving this code more?
import java.util.regex.Matcher; import java.util.regex.Pattern; // This program copies all of table data // in a HTML document into the array of String type. class Tblextrct { enum STATE {cnct, dntct}; public Tblextrct(){ } public String [][] tableAnalysis(String ss, boolean debug){ Pattern trs = Pattern.compile("^<tr"); Matcher trsm; Pattern tre = Pattern.compile("^</tr>"); Matcher trem; Pattern tds = Pattern.compile("^<td"); Matcher tdsm; Pattern tde = Pattern.compile("^</td>"); Matcher tdem; Pattern tags = Pattern.compile("^<(table|/table|tr|/tr|td|/td)"); Matcher tagm ; Pattern ntag = Pattern.compile("^[^<]"); Matcher ntagm ; Pattern spc = Pattern.compile("^(\\s+)$"); Matcher spcm ; int i = 0, j = 0, x = 100*1000, y = 50, xm = 0, ym = 0 ; String [][] array = new String [x][y]; String [] stk = new String[x]; String [] stk1 = new String[x]; String cc = "", dd = ""; ss = ss.replace("<b>", ""); ss = ss.replace("</b>", ""); for ( i = 0 ; i < ss.length();i++){ cc = ss.substring(i,i+1); if ( cc.equals("<")){ stk[j] = dd; j++; dd = cc; } else if ( cc.equals(">")){ stk[j] = dd + cc; j++; dd = ""; } else { dd = dd + cc; } } j = 0 ; for ( i = 0 ; i < stk.length ; i++){ if ( stk[i] != null ){ tagm = tags.matcher(stk[i]); ntagm = ntag.matcher(stk[i]); if ( (ntagm.find() || tagm.find()) && !stk[i].equals("") ){ stk1[j]=stk[i]; j++; } } } j = 0 ; x = 0; y = 0; for ( i = 0 ; i < stk1.length ; i++){ if ( stk1[i] != null ){ trsm = trs.matcher(stk1[i]); trem = tre.matcher(stk1[i]); tdsm = tds.matcher(stk1[i]); tdem = tde.matcher(stk1[i]); ntagm = ntag.matcher(stk1[i]); spcm = spc.matcher(stk1[i]); if ( ntagm.find() && !spcm.find() ){ if (array[x][y] != null){ array[x][y]= array[x][y] + " " + stk1[i]; if (debug){ System.out.println(">0 : "+x+","+y+" "+array[x][y]); } } else { array[x][y] = stk1[i]; if (debug){ System.out.println("<0 : "+x+","+y+" "+stk1[i]); } } } else if ( trsm.find() ){ x++; y = 0; } else if ( trem.find()){ if ( xm < x ){ xm = x; } } else if ( tdsm.find()){ y++; } else if ( tdem.find()){ if ( ym < y ){ ym = y; } } } } xm++; ym++; if (debug){ System.out.println("xm="+xm); System.out.println("ym="+ym); } String [][] ary = new String [xm][ym]; for ( i = 0 ; i < xm ; i++){ for ( j = 0 ; j < ym ; j++ ){ if ( array[i][j] == null ){ ary[i][j] = ""; }else{ ary[i][j] = array[i][j]; } } } return ary; } }
class Test { public static void main(String args[]) { int x = 0 , y = 0; String [][] ss ; String sa = ""; Tblextrct tt = new Tblextrct(); // a input HTML description. String tb ="<table border=\"5\" cellspacing=\"15\" cellpadding=\"10\">\n<tr> <td>Apple</td> <td>Sweet-sour</td> <td>Not quite red</td> </tr> <tr> <td>Chinese citron</td> <td>Quite sour</td> <td>Almost yellow</td> </tr></table>"; ss = tt.tableAnalysis(tb, false); for ( x = 0 ; x < (ss.length);x++){ sa = ""; for ( y = 0 ; y < (ss[x].length); y++){ if ( ss[x][y] != null){ sa = sa + "," + ss[x][y]; } } System.out.println("ss["+x+"]={"+sa+"}"); } } }