/*
** Copyright (C) 2001,2002 Sacha Faust <sacha@severus.org>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/*
 *  Version : 1.3
 */
package faust.sacha.web.bot.spider.data;

import faust.sacha.web.data.URLData;
import faust.sacha.web.data.URLCont;
import faust.sacha.web.util.URLInfo;
import faust.sacha.web.util.WebGlobal;
import faust.sacha.web.util.HTMLFile;
import faust.sacha.web.util.ParserGetter;

import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpException;

import faust.sacha.web.bot.spider.event.ThreadEvent;
import faust.sacha.web.bot.spider.event.ThreadEventManager;
import faust.sacha.web.bot.spider.event.ProcessBroker;
import faust.sacha.web.bot.spider.event.ThreadManager;
import faust.sacha.web.bot.spider.util.EnginGlobal;
import faust.sacha.web.bot.spider.data.FileData;

import java.io.*;

import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;

/**
 * Fetches one URL with an HTTP GET and, when the response is HTML, feeds the
 * body to the HTML parser using an {@link HTMLFile} as the parser callback.
 * <p>
 * The outcome is reported through {@link ThreadEvent}s: links to process and
 * URL information are sent to the {@link ProcessBroker}, and a finish event
 * is broadcast to every listener.
 * <p>
 * The fields <code>m_url</code> and <code>m_broker</code> and the helpers
 * <code>sendEvent</code>, <code>sendEventAll</code>,
 * <code>prepareRequestMethod</code> and <code>processRedirect</code> are not
 * declared in this class; presumably they are inherited from FileGetter.
 */
public class FileDataGetter extends FileGetter {
    
    /** Parser callback that accumulates what the HTML parser reports for the fetched page. */
    private HTMLFile m_fileInfo;
    
    /**
     * Creates a getter for the given URL.
     *
     * @param url    the URL to download
     * @param broker the broker that receives the processing events
     */
    public FileDataGetter( URLData url, ProcessBroker broker ) {
        super("FileDataGetter->" + url.getURL(), url, broker);
        m_fileInfo = new HTMLFile();
    }
//------------------------------------------------------------------------------
    /**
     * Downloads the URL and dispatches the resulting events.
     * <p>
     * If the state is already {@link ThreadEvent#EVENT_STOP} after
     * <code>super.run()</code>, only the finish event is sent. Otherwise the
     * page is fetched; on success an EVENT_TO_PROCESS_LINKS event is sent to
     * the broker. In both the success and failure case an EVENT_TO_SET_INFO
     * event and a finish event follow, and this thread is removed from the
     * thread manager.
     */
    public void run(){
        super.run();
        
        FileData fileData = null;
        ThreadEvent event = null;
        ThreadEvent eventSetInfo = null;
        ThreadEvent eventRemove = null;
        
        if( getState() == ThreadEvent.EVENT_STOP ){
            /*
             *  tell our QueueManager that we are done
             *
             *  NOTE(review): unlike the normal exit path below, this path does
             *  not call EnginGlobal.THREAD_MANAGER.removeThread() - confirm
             *  that this is intended.
             */
            eventRemove = new ThreadEvent(this, null, ThreadEvent.EVENT_FINISH);
            sendEventAll(eventRemove);
            return;
        }
        
        if( getFileData(m_url) ){
            fileData = new FileData(m_fileInfo, m_url);
            
            /*
             *  Tell the process broker that it needs to process
             *  some links from fileData
             */
            event = new ThreadEvent(this, (Object)fileData, ThreadEvent.EVENT_TO_PROCESS_LINKS);
            sendEvent(event, m_broker);
        }
        
        
        /*
         *  Tell the process broker to add the url info to the site
         */
        eventSetInfo = new ThreadEvent(this, (Object)m_url, ThreadEvent.EVENT_TO_SET_INFO);
        sendEvent(eventSetInfo, m_broker);
        
        /*
         *  tell our QueueManager that we are done
         */
        eventRemove = new ThreadEvent(this, null, ThreadEvent.EVENT_FINISH);
        sendEventAll(eventRemove);
        
        /*
         *  Remove ourselves from the ThreadManager
         */
        EnginGlobal.THREAD_MANAGER.removeThread();
        
        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::run() : Exiting run()" );
    }
//------------------------------------------------------------------------------
    /**
     * Executes the GET request for <code>url</code> and parses the response
     * body into <code>m_fileInfo</code> when it is an HTML document.
     * <p>
     * The request is retried (after a 100 ms pause) when an IOException occurs
     * or when the server answers 408 Request Timeout, until the number of
     * retries reaches <code>WebGlobal.getTimeoutNbRetries()</code>; once that
     * limit is hit the method returns <code>false</code> immediately.
     * <p>
     * Status codes that are not listed in the switch below (200 among them)
     * leave the result untouched and are therefore treated as success.
     *
     * @param url the URL to fetch; its response information is updated from
     *            the response headers
     * @return <code>true</code> if the request succeeded and the body was
     *         handed to the parser, <code>false</code> on redirect, error
     *         status, non-HTML content, exhausted retries or any exception
     */
    protected boolean getFileData( URLData url ){
        
        URLCont urlData = null;
        HTMLEditorKit.Parser parser = null;
        HTMLEditorKit.ParserCallback callback = null;
        GetMethod requestMethod = null;
        boolean allGood = true;
        int requestCode;
        int nbTries = 0;
        
        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::getFileData() : getting url : " + url.getURL());
        
        try{
            ParserGetter kit = new ParserGetter();
            parser = kit.getParser();
            callback = m_fileInfo;
            
            urlData = new URLCont( (URLInfo)url );
            requestMethod = new GetMethod();
            prepareRequestMethod( url, (HttpMethod)requestMethod, WebGlobal.FOLLOW_INSIDE_REDIRECT );
            
            /*
             *  Set the max download.
             *  You need to patch the httpclient package with my diff
             *  to get this to work. Please refer to the README.TXT document
             */
            requestMethod.setMaxBodySize(WebGlobal.MAX_REQUEST_DOWNLOAD);
            
            // get rid of expired cookies
            urlData.purgeCookies();

            /*
             *  We retry to open the connection up to WebGlobal.getTimeoutNbRetries()
             *
             *  Logic:
             *      We retry opening the connection until nbTries reaches WebGlobal.getTimeoutNbRetries()
             *      We retry when an IOException occurs or when the server answers 408.
             */
            while( true ){
                try{
                    requestCode = requestMethod.execute(urlData.getState(), urlData);

                    // Checking return code
                    switch( requestCode ){
                        
                        case HttpStatus.SC_MOVED_TEMPORARILY:       // 302 Moved Temporarily (Sometimes Found) (HTTP/1.0 - RFC 1945)
                            if( WebGlobal.DEBUG )
                                System.err.println( getName() + "::getFileData() : got redirect on : " + url.getURL() );
                            
                            //set the request Information
                            url.setResponseInfo( requestMethod.getResponseHeaders() );                            
                            
                            processRedirect(url);
                            allGood = false;
                            break;
                            
                        // section of error codes that indicate that the page is present but we can't see them
                        case HttpStatus.SC_UNAUTHORIZED:            //  401 Unauthorized (HTTP/1.0 - RFC 1945)
                                /*
                                 *  NOTE(review): an earlier comment here said the url is
                                 *  added at this point, but nothing is added in this
                                 *  branch; the request is only flagged as failed.
                                 */
                            allGood = false;
                            break;

                        // handling timeouts
                        case HttpStatus.SC_REQUEST_TIMEOUT:         //  408 Request Timeout (HTTP/1.1 - RFC 2616)
                            if( nbTries == WebGlobal.getTimeoutNbRetries() ){
                                /*
                                 * There is no point to continue. We return immediately
                                 */
                                return false;
                            }
                            else{
                                nbTries++;
                                try{
                                    sleep(100);
                                }
                                catch( InterruptedException intEx ){
                                    // NOTE(review): the interrupt is logged (debug only) and
                                    // dropped; the interrupted status is not restored.
                                    if( WebGlobal.DEBUG )
                                    System.err.println( getName() + "::getFileInfo() : " + intEx );
                                }
                                // restarts the enclosing while loop, i.e. re-executes the request
                                continue;
                            }                            
                        // ERRORS
                        case HttpStatus.SC_BAD_REQUEST:             //  400 Bad Request (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_NOT_FOUND:               //  404 Not Found (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_GONE:                    //  410 Gone (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_NOT_ACCEPTABLE:          //  406 Not Acceptable (HTTP/1.1 - RFC 2616)

                        // Interesting errors
                        case HttpStatus.SC_INTERNAL_SERVER_ERROR:   //  500 Server Error (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_SERVICE_UNAVAILABLE:     //  503 Service Unavailable (HTTP/1.0 - RFC 1945)

                        // Section of error codes that I want to see if I get them
                        case HttpStatus.SC_PAYMENT_REQUIRED:        //  402 Payment Required (HTTP/1.1 - RFC 2616) : This code is reserved for future use.
                        case HttpStatus.SC_FORBIDDEN:               //  403 Forbidden (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_METHOD_NOT_ALLOWED:      //  405 Method Not Allowed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_CONFLICT:                //  409 Conflict (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_LENGTH_REQUIRED:         //  411 Length Required (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_PRECONDITION_FAILED:     //  412 Precondition Failed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUEST_TOO_LONG:        //  413 Request Entity Too Large (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUEST_URI_TOO_LONG:    //  414 Request-URI Too Long (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:  //  415 Unsupported Media Type (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE:     //  416 Requested Range Not Satisfiable (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_EXPECTATION_FAILED:      //  417 Expectation Failed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_INSUFFICIENT_SPACE_ON_RESOURCE:      //  419 Insufficient Space on Resource (WebDAV - draft-ietf-webdav-protocol-05?) or 419 Proxy Reauthentication Required (HTTP/1.1 drafts?)
                        case HttpStatus.SC_METHOD_FAILURE:          //  420 Method Failure (WebDAV - draft-ietf-webdav-protocol-05?)
                        case HttpStatus.SC_UNPROCESSABLE_ENTITY:    //  422 Unprocessable Entity (WebDAV - RFC 2518)
                        case HttpStatus.SC_LOCKED:                  //  423 Locked (WebDAV - RFC 2518)
                        case HttpStatus.SC_FAILED_DEPENDENCY:       //  424 Failed Dependency (WebDAV - RFC 2518)
                        case HttpStatus.SC_NOT_IMPLEMENTED:         //  501 Not Implemented (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED:      //  505 HTTP Version Not Supported (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_INSUFFICIENT_STORAGE:    //  507 Insufficient Storage (WebDAV - RFC 2518)

                        // section indicating proxy error
                        case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:   // 407 Proxy Authentication Required (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_BAD_GATEWAY:             //  502 Bad Gateway (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_GATEWAY_TIMEOUT:         //  504 Gateway Timeout (HTTP/1.1 - RFC 2616)
                            allGood = false;
                            break;
                    }
                }
                catch( HttpException httpEx ){
                    // protocol level failure: not retried, the request is just flagged as failed
                    System.err.println( getName() + "::getFileInfo() : HttpException : " + httpEx );
                    allGood = false;
                }
                catch( IOException ioExRequest ){
                    
                    if( WebGlobal.DEBUG )
                        System.err.println( getName() + "::getFileData() : IOException:open() : " + nbTries + " : " + ioExRequest );
                    
                    if( nbTries == WebGlobal.getTimeoutNbRetries() ){
                        /*
                         * There is no point to continue. We return immediately
                         */
                        return false;
                    }
                    else{
                        nbTries++;
                        try{
                            sleep(100);
                        }
                        catch( InterruptedException intEx ){
                            // NOTE(review): the interrupt is logged (debug only) and
                            // dropped; the interrupted status is not restored.
                            if( WebGlobal.DEBUG )
                                System.err.println( getName() + "::getFileData() : " + intEx );
                        }
                        continue;
                    }
                }
                
                /*
                 *  The request was executed (successfully or not) and no retry
                 *  is pending, so we get out of the while(true) loop
                 */
                break;
            }

            if( allGood ){
                //set the request Information
                url.setResponseInfo( requestMethod.getResponseHeaders() );
                
                /*
                 *  A missing Content-Type is handled like any non-HTML type
                 *  instead of letting a NullPointerException reach the generic
                 *  handler below.
                 */
                if( url.getContentTypeInfo() == null
                        || url.getContentTypeInfo().toLowerCase().indexOf("text/html") == -1 ){
                    System.err.println(getName() + "::getFileData() : Content-Type : " + url.getContentTypeInfo() + " on url : " + url.getURL() );
                    
                    allGood = false;
                }
                else{
                    // if everything is good, we pass the info to the handler
                    // NOTE(review): the reader uses the platform default charset,
                    // not the charset announced by the response - confirm this is acceptable.
                    InputStream in = requestMethod.getResponseBodyAsStream();
                    InputStreamReader r = new InputStreamReader(in);
                    parser.parse(r, callback,true);
                }
            }
        }
        catch( Exception e ){
            System.err.println( getName() + "::getFileData() : exception : " + e );
            e.printStackTrace();
            allGood = false;
        }
        finally{
            /*
             *  urlData is still null when the failure happened before the
             *  URLCont was created; closing it unconditionally would replace
             *  the handled exception with a NullPointerException that escapes
             *  this method.
             */
            if( urlData != null ){
                try{
                    urlData.close();
                }
                catch( IOException closeIoEx ){
                    System.err.println(getName() + "::getFileData() : IOException:close() " + closeIoEx );
                }
            }
        }
        
        // page 251 :Java Network Programming
        // callback is still null when the parser could not be created
        if( callback != null ){
            try{
                callback.flush();
            }
            catch( BadLocationException badLocE ){
                System.err.println(getName() + "::getFileData() : " + badLocE );
            }
        }
        
        return allGood;
    }
//------------------------------------------------------------------------------
}
