/*
** Copyright (C) 2001,2002 Sacha Faust <sacha@severus.org>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/*
 *  Version : 1.3
 */
package faust.sacha.web.bot.spider.data;

import faust.sacha.web.bot.spider.*;
import faust.sacha.web.bot.spider.data.*;
import faust.sacha.web.bot.spider.util.*;
import faust.sacha.web.bot.spider.event.*;

import faust.sacha.web.util.*;
import faust.sacha.web.data.*;

import java.util.*;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.regex.*;


public class SiteURLProcessor extends ThreadEventManager {
    
    private UrlQInt m_todoQueue;
    private Site m_site;
    private boolean m_processing;
    
    public SiteURLProcessor( String name, Site site, UrlQInt toProcess ) {
        super(name);
        m_todoQueue = toProcess;
        m_site = site;
        m_processing = false;
    }
//------------------------------------------------------------------------------
    public void run(){
        super.run();
    
        process();
        
        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::run() : Exiting run()" );        
    }
//------------------------------------------------------------------------------
    private synchronized void process(){
        int state;
        
        while( true ){
            state = getState();
            
            if( (state == ThreadEvent.EVENT_STOP) || (state == ThreadEvent.EVENT_SITE_FINISH) )
                break;
            
            /*
             *  We release the lock and wait to be notified
             */
            try{
                wait();
            }
            catch( InterruptedException intEx ){
                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::run() : " + intEx );
            }
        }
    }
//------------------------------------------------------------------------------
    public void receiveEvent( ThreadEvent event ){
        Object result = null;
        Object source = null;
        int eventType = 0;
        
        setProcessing(true);
        result = event.getResult();
        source = event.getSource();
        eventType = event.getEventType();
        
        switch( eventType ){
            case ThreadEvent.EVENT_TO_PROCESS_LINKS:
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::receiveEvent : EVENT_TO_PROCESS_LINKS  from : " + source.getClass().getName() );
                
                if( result == null ){
                    System.err.println("CODE ERROR->" + getName() + "::receiveEvent : got null result on event : EVEN_TO_PROCESS_LINS from : " + source.getClass().getName() );
                    
                    break;
                }
                else if( !(result instanceof FileData) ){
                    System.err.println("CODE ERROR->" + "::receiveEvent : event type : EVENT_TO_PROCESS_LINKS : data type of: " + result.getClass().getName() );
                    
                    break;
                }
                else{
                    processFileInfo( (FileData)result );
                    break;
                }
            case ThreadEvent.EVENT_TO_SET_INFO:
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::receiveEvent : event type : EVENT_TO_SET_INFO: data type of: " + result.getClass().getName() + "from : " + source.getClass().getName() );
                
                if( result == null ){
                    System.err.println("CODE ERROR->" + getName() + "::receiveEvent : got null result on event : EVENT_TO_SET_INFO from : " + source.getClass().getName() );
                    break;
                }
                else if( !(result instanceof URLData) ){
                    System.err.println("CODE ERROR->" + getName() + "::receiveEvent : event type : EVENT_TO_SET_INFO: data type of: " + result.getClass().getName() + " from : " + source.getClass().getName() );
                    
                    break;                    
                }
                else{
                    setInfoToSite( (URLData)result );
                    break;
                }
            case ThreadEvent.EVENT_SITE_FINISH:
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::receiveEvent : event type : EVENT_SITE_FINISH : from : " + source.getClass().getName() );
                
                if( result != null )
                    System.err.println("CODE ERROR->" + getName() + "::receiveEvent : event type : EVENT_SITE_FINISH : data is not NULL and is type of: " + result.getClass().getName() );
                
                setState(eventType);
                break;
            default:
                //pass the message to the parent
                super.receiveEvent(event);
                break;
        }
        
        setProcessing(false);
    }
//------------------------------------------------------------------------------    
    protected void processFileInfo( FileData fileInfo ){
        
        // setting URLData info
        
        // adding links to the file
        fileInfo.getParentURL().addLinkList( fileInfo.getFileData().getURLList() );
        
        processFileURLList( fileInfo );
        processEmailList( fileInfo );
        processCommentList( fileInfo );
        
        // setting the site info.
        setInfoToSite(fileInfo.getParentURL());
    }     
//------------------------------------------------------------------------------
    private void processFileURLList( FileData fileInfo ){
        ArrayList urlList = null;
        //boolean stillGoOn = true;   // for illegalCaracters check
        String strURL = null;
        URLData newURLInfo = null;
        URLInfo urlBaseInfo = null;
        URLInfo wouldBeSameAs = null;
        URLInfo url = null;
        SimpleJavaScriptURLGetter javaScriptCallHandler = null;
        String tmpStr = null;
        Pattern protocolPattern = Pattern.compile("^([a-zA-Z0-9]{1,5}://)");    // find protocol pattern at the beginning of the script
        Matcher patternMatcher = null;
        String protocolName = null;
        
        /*
         * This fix some of the bug I've see when the code tries to fetch
         * java script.
         */
        //String illegalCaracters[] = { "(", ")", "{", "}" };
        
        urlList = fileInfo.getFileData().getURLList();
        url = (URLInfo)fileInfo.getParentURL();
        
        try{
            urlBaseInfo = new URLInfo( url.getBaseURL() );
        }
        catch( MalformedURLException urlEx ){
            // This is a serious problem if we get this here
            System.err.println( getName() + "::processFileURLList : " + urlEx );
        }
        
        for( int i = 0; i < urlList.size(); i++ ){
            strURL = (String)urlList.get(i);
            
            /*
             * Checking for the presence of javascript() in the url
             */
            if( strURL.toLowerCase().indexOf("javascript") != -1 ){
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::processFileURLList() : found javascript in url : " + strURL );
                
                try{
                    javaScriptCallHandler = new SimpleJavaScriptURLGetter(strURL);
                }
                catch( ParseException parseEx ){
                    continue;
                }
                tmpStr = javaScriptCallHandler.getURLRef();
                if( (tmpStr == null) || tmpStr.equals("") )
                    continue;
                
                if( tmpStr.toLowerCase().startsWith("mailto:") ){
                    addEmailToList(tmpStr, fileInfo.getParentURL());
                    continue;
                }
                
                patternMatcher = protocolPattern.matcher(tmpStr);
                if( patternMatcher.find() ){
                    /*
                     *  Get the protocol name
                     */
                    protocolName = patternMatcher.group(1).toLowerCase();
                    
                    if( !protocolName.startsWith("http://") && !protocolName.startsWith("https://") ){
                        // misc protocol
                        continue;
                    }
                    
                }
                
                // Set the strURL and continue to process it
                strURL = tmpStr;
            }
                        
            /*
             *  Checking for creation of the same url as the parent url
             *
             *  Logic:
             *  Check if the combinaison the strURL and url.getBaseURL() would create the same
             *  link referenced to the current url. If it would, we simply skip it.
             *  This goes on for all the strURL that will be added to the parent URL.
             *  This is accomplish by the code using the variable wouldBeSameAs.
             */
             
            if( strURL.startsWith("_top") || strURL.startsWith("#") )       //getting rid of the reference to same doc
                continue;
            else if( strURL.startsWith("http://") || strURL.startsWith("https://") ){   // full URL
                
                /*
                 *  Checking if the url referencing itself
                 */
                if( strURL.compareToIgnoreCase(url.getURL()) == 0 ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : strURL : " + strURL + " same as url : " + url.getURL() );
                    
                    continue;
                }
                try{
                    newURLInfo = new URLData(strURL);
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList() : protocol " + urlEx );
                    continue;
                }
            }
            else if( strURL.lastIndexOf("/") == -1 ){           // file in same folder : Example : ulref = http://www.test.com/index.html  strURL = index2.html
                try{
                    wouldBeSameAs = new URLData( strURL, urlBaseInfo );
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList() : protocol " + urlEx );
                    continue;
                }
                
                if( wouldBeSameAs.sameAs(url) ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : new url " + strURL + " with main url base " + urlBaseInfo.getURL() + " would be same as " + url.getURL() );
                    
                    continue;
                }
                
                try{
                    newURLInfo = new URLData(strURL, url);
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList()-1 : " + urlEx );
                    continue;
                }                
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::processFileURLList-1 : relative url : urlRef : " + url.getURL() + " | strURL = " + strURL + " | newURL : " + newURLInfo.getURL() );
               
            }
            else if( (!strURL.startsWith("/")) && (strURL.lastIndexOf("/") != -1) ){    // folder/file or folder/file/ combo : the folder link is not preceded by a /
                try{
                    wouldBeSameAs = new URLInfo( strURL, urlBaseInfo );
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList()-2 :" + urlEx );
                    continue;
                }
                
                if( wouldBeSameAs.sameAs(url) ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : new url " + strURL + " with main url base " + urlBaseInfo.getURL() + " would be same as " + url.getURL() );
                    
                    continue;
                }
                
                try{
                    newURLInfo = new URLData(strURL, url);
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList()-2 :" + urlEx );
                    continue;
                }
                
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::processFileURLList-2 : relative url : urlRef : " + url.getURL() + " | strURL = " + strURL + " | newURL : " + newURLInfo.getURL() );
               
            }
            else if( strURL.startsWith("/") ){  // Example : urlref = http://www.test.com strURL = /testing123

                try{
                    wouldBeSameAs = new URLData( strURL, urlBaseInfo );
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList()-3 :" + urlEx );
                    continue;
                }
                
                if( wouldBeSameAs.sameAs(url) ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : new url " + strURL + " with main url base " + urlBaseInfo.getURL() + " would be same as " + url.getURL() );
                    
                    continue;
                }
                
                /*
                 * Here we create the new url with the parent base url since
                 * the link starts at the root.
                 */
                try{
                    newURLInfo = new URLData( strURL, urlBaseInfo );
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList()-3 :" + urlEx );
                    continue;
                }                

                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::processFileURLList-3 : relative url : urlRef : " + url.getURL() + " | strURL = " + strURL + " | newURL : " + newURLInfo.getURL() );
            
            }
            else{
                try{
                    newURLInfo = new URLData(strURL);
                }
                catch( MalformedURLException urlEx ){
                    System.err.println(getName() + "::processFileURLList() :" + urlEx );
                    continue;
                }                
            }
            
            if( !m_site.isURLAlreadyDone( (URLInfo)newURLInfo) ){
                m_site.addURLDone( (URLInfo)newURLInfo );
                
                /*
                 * Checking if the new url is in the same domain as the other one
                 * if not, we just continue to loop and we don't go fetch it
                 */
                if( url.getHost().compareToIgnoreCase(newURLInfo.getHost()) != 0 ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : " + newURLInfo.getURL() + " not in same domain ");

                    m_site.addOutsideURL( (URLInfo)newURLInfo );
                    continue;
                }
                else if( newURLInfo.getPort() != url.getPort() ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : " + newURLInfo.getURL() + " port : " + newURLInfo.getPort() );
                    
                    m_site.addOutsideURL( (URLInfo)newURLInfo );
                    continue;
                }
                else if( !newURLInfo.getProtocol().equalsIgnoreCase(url.getProtocol()) ){
                    if( WebGlobal.DEBUG )
                        System.err.println(getName() + "::processFileURLList() : " + newURLInfo.getURL() + " protocol : " + newURLInfo.getProtocol() );
                    
                    m_site.addOutsideURL( (URLInfo)newURLInfo );
                    continue;
                        
                }
                
                m_todoQueue.add( newURLInfo );
            }
            else{
                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::processFileURLList() : url already done : " + newURLInfo.getURL() );
            }
        }
    }
//------------------------------------------------------------------------------
    private void processEmailList( FileData fileInfo ){
        ArrayList foundEmails = fileInfo.getFileData().getEmailFound();
        URLData url = fileInfo.getParentURL();
        EmailURL newEmail = null;
        
        for( int i = 0; i < foundEmails.size(); i++ ){
            String emailStr = (String)foundEmails.get(i);
            try{
                newEmail = new EmailURL(emailStr);
                // we add it to the site emails
                m_site.addEmail(newEmail);
            
                //we add it to the file
                url.addEmail(newEmail);
            }
            catch( MalformedURLException urlEx ){
                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::processEmailList : " + urlEx + " on : " + emailStr );
            }
        }
    }    
//------------------------------------------------------------------------------
    private void addEmailToList( String emailLink, URLData url ){
        EmailURL newEmail = null;
        
        try{
            newEmail = new EmailURL(emailLink);
            m_site.addEmail(newEmail);
            url.addEmail(newEmail);
        }
        catch( MalformedURLException urlEx ){
            if( WebGlobal.DEBUG )
                System.err.println( getName() + "::addEmailToList : " + urlEx + " on : " + emailLink );
        }
    }
//------------------------------------------------------------------------------
    private void processCommentList( FileData fileInfo ){
        ArrayList comments = fileInfo.getFileData().getComments();
        URLData url = fileInfo.getParentURL();
        
        if( comments.size() > 0 )
            url.addCommentList( comments );
    }    
//------------------------------------------------------------------------------
    private void setInfoToSite( URLData url ){
        FileURL newFile = null;
        
        switch( url.getURLType() ){
            case WebGlobal.URL_TYPE_FILE:
                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::setInfoToSite() : URL_TYPE_FILE for : " + url.getURL() );
                
                try{
                    newFile = new FileURL(url);
                }
                catch( MalformedURLException urlEx ){
                    System.err.println( getName() + "::setInfoToSite() : " + urlEx);
                    return;
                }
                m_site.addFileToFolder(url.getFolder(), newFile);
                break;
            case WebGlobal.URL_TYPE_FOLDER:
                if( WebGlobal.DEBUG )
                    System.err.println(getName() + "::setInfoToSite() : URL_TYPE_FOLDER for : " + url.getURL() );
                
                m_site.addFolder( url.getFolder() );
                break;
            default:
                System.err.println("CODE ERROR->" + getName() + "::setInfoToSite : CRITICAL : unknown URL TYPE SPECIFIED : " + url.getURLType() );
                break;
        }
    }
//------------------------------------------------------------------------------
    public synchronized void waitForProcessing(){
        
        if( WebGlobal.DEBUG )
            System.err.println( getName() + "::waitForProcessing() : m_processing : " + m_processing );
        
        while( m_processing == true ){
            try{
               /*
                *   Release the lock and wait to be notified
                */
                wait();
            }
            catch( InterruptedException intEx ){
                if( WebGlobal.DEBUG )
                    System.err.println( getName() + "::waitForProcessing() : " + intEx );
            }
        }
    }
//------------------------------------------------------------------------------
    private synchronized void setProcessing( boolean value ){
        m_processing = value;
        
        /*
         *  Alert all that processing as changed
         *      - waitForProcessing() needs this
         */
        notifyAll();
    }
//------------------------------------------------------------------------------     
}
