/*
** Copyright (C) 2001,2002 Sacha Faust <sacha@severus.org>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/*
 *  Version : 1.3
 */
package faust.sacha.web.bot.spider.data;

import faust.sacha.web.data.URLData;
import faust.sacha.web.data.URLCont;
import faust.sacha.web.util.URLInfo;
import faust.sacha.web.util.WebGlobal;

import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpException;

import faust.sacha.web.bot.spider.event.ThreadEvent;
import faust.sacha.web.bot.spider.event.ThreadEventManager;
import faust.sacha.web.bot.spider.event.ProcessBroker;
import faust.sacha.web.bot.spider.event.ThreadManager;
import faust.sacha.web.bot.spider.util.EnginGlobal;

import java.io.IOException;

/**
 * {@link FileGetter} worker that issues an HTTP HEAD request for a single url
 * and, depending on the outcome, asks the process broker either to fetch the
 * url's data ({@code EVENT_TO_GET_DATA}) or to simply record its information
 * ({@code EVENT_TO_SET_INFO}).
 */
public class FileInfoGetter extends FileGetter {

    /** Pause, in milliseconds, between two attempts of a request that timed out. */
    private static final int RETRY_DELAY_MS = 100;

    /**
     * Creates a getter for the given url.
     *
     * @param url    the url whose information is to be requested
     * @param broker the process broker that receives the resulting event
     */
    public FileInfoGetter(URLData url, ProcessBroker broker) {
        super("FileInfoGetter->" + url.getURL(), url, broker);
    }
//------------------------------------------------------------------------------
    /**
     * Runs the HEAD request and reports the outcome.
     *
     * After delegating to {@code super.run()}, a stop request
     * ({@code getState() == ThreadEvent.EVENT_STOP}) only produces an
     * {@code EVENT_FINISH} broadcast. Otherwise the result of
     * {@link #getFileInfo(URLData)} selects the event sent to the broker,
     * an {@code EVENT_FINISH} is broadcast, and the thread is removed from
     * {@code EnginGlobal.THREAD_MANAGER}.
     */
    public void run(){
        super.run();

        if( getState() == ThreadEvent.EVENT_STOP ){
            /*
             *  tell our QueueManager that we are done
             */
            // NOTE(review): unlike the normal exit path below, this path does
            // not call EnginGlobal.THREAD_MANAGER.removeThread() - confirm
            // that this is intended.
            sendEventAll( new ThreadEvent(this, null, ThreadEvent.EVENT_FINISH) );
            return;
        }

        ThreadEvent event;
        if( getFileInfo( m_url ) ){
            if( WebGlobal.DEBUG )
                System.err.println( getName() + " : adding TO_GET_DATA : " + m_url.getURL() );

            /*
             *  Tell the process broker that it needs to send this url
             *  to the GetDataManager
             */
            event = new ThreadEvent(this, (Object)m_url, ThreadEvent.EVENT_TO_GET_DATA);
        }
        else{
            if( WebGlobal.DEBUG )
                System.err.println( getName() + " : adding TO_SET_INFO : " + m_url.getURL() );

            /*
             *  Tell the process broker that it needs to set this file info
             *  in the site
             */
            event = new ThreadEvent(this, (Object)m_url, ThreadEvent.EVENT_TO_SET_INFO);
        }

        /*
         *  tell our QueueManager that we are done
         */
        if( WebGlobal.DEBUG )
            System.err.println( getName() + " : sending eventRemove" );
        sendEventAll( new ThreadEvent(this, null, ThreadEvent.EVENT_FINISH) );

        if( WebGlobal.DEBUG )
            System.err.println( getName() + " : sending event to m_broker" );
        //  send the result to the process broker
        sendEvent(event, (ThreadEventManager)m_broker);

        /*
         *  Remove ourselves from the ThreadManager
         */
        EnginGlobal.THREAD_MANAGER.removeThread();

        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::run() : Exiting run()" );
    }
//------------------------------------------------------------------------------
    /**
     * Sends a HEAD request for the url and decides whether its data should be
     * fetched afterwards.
     *
     * The request is retried on a 408 status or an {@link IOException} until
     * {@code WebGlobal.getTimeoutNbRetries()} retries have been used. The
     * connection wrapper is closed on every exit path, including the
     * "retries exhausted" early returns.
     *
     * @param url the url to query; its response information is updated from
     *            the response headers
     * @return {@code true} when the url should be handed over for data
     *         retrieval, {@code false} otherwise
     */
    private boolean getFileInfo( URLData url ){
        boolean allGood = true;
        int nbTries = 0;
        int requestCode;

        URLCont urlData = new URLCont( (URLInfo)url );
        HeadMethod requestMethod = new HeadMethod();
        prepareRequestMethod( url, requestMethod, WebGlobal.FOLLOW_INSIDE_REDIRECT );

        // get rid of expired cookies
        urlData.purgeCookies();

        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::getFileInfo() : getting : " + url.getURL() );

        try{
            while( true ){
                try{
                    requestCode = requestMethod.execute(urlData.getState(), urlData);

                    // Checking return code
                    switch( requestCode ){
                        case HttpStatus.SC_MOVED_TEMPORARILY:       // 302 Moved Temporarily (Sometimes Found) (HTTP/1.0 - RFC 1945)
                            if( WebGlobal.DEBUG )
                                System.err.println( getName() + "::getFileInfo() : got redirect on : " + url.getURL() );

                            // set the response information before following the redirect
                            url.setResponseInfo( requestMethod.getResponseHeaders() );

                            processRedirect(url);
                            allGood = false;
                            break;

                        // ERRORS
                        case HttpStatus.SC_BAD_REQUEST:             //  400 Bad Request (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_NOT_FOUND:               //  404 Not Found (HTTP/1.0 - RFC 1945)
                            /*
                             *  If the url is a query, we try to fetch the data anyway:
                             *  since allGood stays true, it will be sent on for data retrieval
                             */
                            if( !hasQuery(url) )
                                allGood = false;
                            break;

                        case HttpStatus.SC_GONE:                    //  410 Gone (HTTP/1.1 - RFC 2616)
                            allGood = false;
                            break;

                        case HttpStatus.SC_NOT_ACCEPTABLE:          //  406 Not Acceptable (HTTP/1.1 - RFC 2616)
                            if( !isPotentialHTMLFile(url) )
                                allGood = false;
                            else{
                                if( WebGlobal.DEBUG )
                                    System.err.println( getName() + " : potential html for file : " + url.getFile() + " for url : " + url.getURL() );
                            }
                            break;

                        // handling timeouts
                        case HttpStatus.SC_REQUEST_TIMEOUT:         //  408 Request Timeout (HTTP/1.1 - RFC 2616)
                            if( nbTries == WebGlobal.getTimeoutNbRetries() ){
                                /*
                                 * There is no point in continuing. We return immediately
                                 * (the finally block below still closes urlData)
                                 */
                                return false;
                            }
                            nbTries++;
                            pauseBeforeRetry();
                            continue;

                        // Interesting errors
                        case HttpStatus.SC_INTERNAL_SERVER_ERROR:   //  500 Server Error (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_SERVICE_UNAVAILABLE:     //  503 Service Unavailable (HTTP/1.0 - RFC 1945)
                            System.err.println( getName() + "::getFileInfo() : got Server Error " + requestCode + " on : " + url.getURL() );
                            allGood = false;
                            break;

                        // section of error codes that indicate that the page is present but we can't see it
                        case HttpStatus.SC_UNAUTHORIZED:            //  401 Unauthorized (HTTP/1.0 - RFC 1945)
                            /*
                             *  We add the url here since we won't be able to fetch it later
                             */
                            allGood = false;
                            break;

                        // Section of error codes that I want to see if I get them
                        case HttpStatus.SC_PAYMENT_REQUIRED:        //  402 Payment Required (HTTP/1.1 - RFC 2616) : This code is reserved for future use.
                        case HttpStatus.SC_FORBIDDEN:               //  403 Forbidden (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_METHOD_NOT_ALLOWED:      //  405 Method Not Allowed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_CONFLICT:                //  409 Conflict (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_LENGTH_REQUIRED:         //  411 Length Required (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_PRECONDITION_FAILED:     //  412 Precondition Failed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUEST_TOO_LONG:        //  413 Request Entity Too Large (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUEST_URI_TOO_LONG:    //  414 Request-URI Too Long (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_UNSUPPORTED_MEDIA_TYPE:  //  415 Unsupported Media Type (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_REQUESTED_RANGE_NOT_SATISFIABLE:     //  416 Requested Range Not Satisfiable (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_EXPECTATION_FAILED:      //  417 Expectation Failed (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_INSUFFICIENT_SPACE_ON_RESOURCE:      //  419 Insufficient Space on Resource (WebDAV - draft-ietf-webdav-protocol-05?) or 419 Proxy Reauthentication Required (HTTP/1.1 drafts?)
                        case HttpStatus.SC_METHOD_FAILURE:          //  420 Method Failure (WebDAV - draft-ietf-webdav-protocol-05?)
                        case HttpStatus.SC_UNPROCESSABLE_ENTITY:    //  422 Unprocessable Entity (WebDAV - RFC 2518)
                        case HttpStatus.SC_LOCKED:                  //  423 Locked (WebDAV - RFC 2518)
                        case HttpStatus.SC_FAILED_DEPENDENCY:       //  424 Failed Dependency (WebDAV - RFC 2518)
                        case HttpStatus.SC_NOT_IMPLEMENTED:         //  501 Not Implemented (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED:      //  505 HTTP Version Not Supported (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_INSUFFICIENT_STORAGE:    //  507 Insufficient Storage (WebDAV - RFC 2518)
                            System.err.println( getName() + "::getFileInfo() : got un usual error " + requestCode + " on : " + url.getURL() );
                            allGood = false;
                            break;

                        // section indicating proxy error
                        case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:   // 407 Proxy Authentication Required (HTTP/1.1 - RFC 2616)
                        case HttpStatus.SC_BAD_GATEWAY:             //  502 Bad Gateway (HTTP/1.0 - RFC 1945)
                        case HttpStatus.SC_GATEWAY_TIMEOUT:         //  504 Gateway Timeout (HTTP/1.1 - RFC 2616)
                            System.err.println( getName() + "::getFileInfo() : got proxy error " + requestCode + " on : " + url.getURL() );
                            allGood = false;
                            break;
                    }
                }
                catch( HttpException httpEx ){
                    System.err.println( getName() + "::getFileInfo() : HttpException : " + httpEx );
                    allGood = false;
                }
                catch( IOException ioExRequest ){
                    if( WebGlobal.DEBUG )
                        System.err.println( getName() + "::getFileInfo() : IOException:open() : " + nbTries + " : " + ioExRequest );

                    if( nbTries == WebGlobal.getTimeoutNbRetries() ){
                        /*
                         * There is no point in continuing. We return immediately
                         * (the finally block below still closes urlData)
                         */
                        return false;
                    }
                    nbTries++;
                    pauseBeforeRetry();
                    continue;
                }

                // any status that was not retried ends the loop
                break;
            }

            // set the response information
            url.setResponseInfo( requestMethod.getResponseHeaders() );

            // An explicit locale keeps the comparison independent of the
            // default locale (e.g. Turkish lower-casing of 'I').
            if( url.getContentTypeInfo().toLowerCase(java.util.Locale.ENGLISH).indexOf("text/html") == -1 ){

                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::getFileInfo() : Content-Type : " + url.getContentTypeInfo() + " on url : " + url.getURL() );

                /*
                 *  If the url is a query, we try to fetch the data anyway:
                 *  since allGood stays true, it will be sent on for data retrieval
                 */
                if( !hasQuery(url) )
                    allGood = false;
            }

            // check for content-location header
            if( url.getHeaderContent("content-location", 0) != null ){
                processContentLocation(url);
                allGood = false;
            }

            return allGood;
        }
        finally{
            closeConnection(urlData);
        }
    }
//------------------------------------------------------------------------------
    /**
     * Tells whether the url carries a non-empty query string.
     *
     * The comparison is done on content rather than by reference, and a
     * {@code null} query counts as "no query".
     *
     * @param url the url to inspect
     * @return {@code true} when the query is neither {@code null} nor empty
     */
    private static boolean hasQuery( URLData url ){
        return url.getQuery() != null && !"".equals( url.getQuery() );
    }
//------------------------------------------------------------------------------
    /**
     * Waits {@link #RETRY_DELAY_MS} milliseconds before the next attempt.
     *
     * An interruption only cuts the pause short; it is reported in debug mode
     * and the retry proceeds.
     * NOTE(review): the interrupt status is not restored here, to keep the
     * existing retry behaviour - confirm whether callers rely on interruption.
     */
    private void pauseBeforeRetry(){
        try{
            sleep(RETRY_DELAY_MS);
        }
        catch( InterruptedException intEx ){
            if( WebGlobal.DEBUG )
                System.err.println( getName() + "::getFileInfo() : " + intEx );
        }
    }
//------------------------------------------------------------------------------
    /**
     * Closes the connection wrapper, reporting (but not propagating) any
     * {@link IOException} raised while closing.
     *
     * @param urlData the connection wrapper to close
     */
    private void closeConnection( URLCont urlData ){
        try{
            urlData.close();
        }
        catch( IOException closeIoEx ){
            System.err.println( getName() + "::getFileInfo() : IOException:close() " + closeIoEx );
        }
    }
//------------------------------------------------------------------------------

}
