import logging
from urlparse import urlparse
import httplib
import traceback

from parallels.hosting_check import \
        DomainIssue, Severity, WebsiteAvailabilityIssueType
from parallels.hosting_check.utils import http_client
from parallels.hosting_check.utils.html_parser import HtmlParser
from parallels.hosting_check.utils.logwatch import LogWatch
from parallels.hosting_check.messages import MSG

logger = logging.getLogger(__name__)


class WebsiteAvailabilityChecker(object):
    def __init__(self, website_availability_check_timeout=30):
        self.website_availability_check_timeout = website_availability_check_timeout

    def check(self, domains):
        issues = []
        for domain in domains:
            issues += self._check_single_domain(domain)
        return issues

    def _check_single_domain(self, domain):
        """Check the main page and the APS application URLs of one domain.

        If the domain has a runner, it is connected before the checks and
        disconnected afterwards, even when a check raises an exception.

        Returns:
            List of issues found for the domain.
        """
        found_issues = []
        visited = CheckedUrls()

        if domain.runner:
            domain.runner.connect()
        try:
            # The main page is the only one checked together with the
            # pages it refers to
            found_issues.extend(self._check_domain_relative_url(
                domain, '', visited, check_referred_pages=True
            ))

            # APS application paths are requested as directories
            aps_urls = [
                '%s/' % aps_url
                for aps_url in domain.aps_domain_relative_urls
            ]
            logger.debug(
                "Checking APS application URLs: '%s'", aps_urls)
            for aps_url in aps_urls:
                if visited.already_checked(aps_url):
                    continue
                found_issues.extend(self._check_domain_relative_url(
                    domain, aps_url, visited))
        finally:
            if domain.runner:
                domain.runner.disconnect()

        return found_issues

    def _check_domain_relative_url(
        self, domain, domain_relative_url, checked_urls, 
        check_referred_pages=False, allow_redirect=True
    ):
        """Check a single URL of a domain and return the list of found issues.

        The page is requested from the target server (domain.web_server_ip)
        and, if domain.source_web_server_ip is set, from the source server
        too. Then status codes and HTML titles are compared, and if something
        looks wrong, page contents and error logs are searched for error
        signatures.

        Arguments:
            domain: domain description object (protocol, domain name,
                server IPs, signatures and so on are read from it)
            domain_relative_url: URL relative to the domain's root
            checked_urls: CheckedUrls object shared by all checks of
                the domain; the URL is added to it at the end of the check
            check_referred_pages: if True, local links (<a href>) and
                frames (<frame src>) found on the page are checked too
            allow_redirect: if True, a 301/302 redirect to a local page
                is followed (only once - nested call passes False)
        Returns:
            List of DomainIssue objects. Any exception except
            KeyboardInterrupt raised inside of the check is converted
            into an INTERNAL_ERROR issue instead of being propagated.
        """
        issues = []

        # Report URL that fails decodability check and skip all other checks
        if not self._url_is_utf_encoded(domain_relative_url):
            failed_url = unicode(domain_relative_url, errors='replace')
            issues.append(DomainIssue(
                domain_name=domain.domain_name, 
                severity=Severity.WARNING, 
                category=WebsiteAvailabilityIssueType.CANNOT_DECODE_URL,
                problem=MSG(
                    'cannot_decode_url', url=failed_url,
                    server_ip=domain.web_server_ip)))
            return issues

        url = u"%s://%s/%s" % (
            domain.protocol, 
            domain.domain_name, 
            self._normalize_path(domain_relative_url)
        )

        logger.debug(u"Performing website check at URL '%s'", url)
        logger.debug(
            u"Target IP is: %s, source IP is: %s",
            domain.web_server_ip, domain.source_web_server_ip
        )

        logwatch = LogWatch(domain)
        try:
            # Started before the requests, so records that appear in the
            # logs during the requests can be inspected later
            # by _check_error_logs()
            logwatch.start()
            response = self._get_page(
                url, domain.web_server_ip, domain, issues)

            if response is None:
                # server did not respond; skipping further checks
                return issues
            
            if domain.source_web_server_ip is not None:
                # failure to get the page from the source server is
                # reported as a warning, not as an error
                source_response = self._get_page(
                    url, domain.source_web_server_ip, domain, issues,
                    Severity.WARNING)
            else:
                source_response = None

            if (
                    response is not None and 
                    response.code == 401 and 
                    source_response is not None and 
                    source_response.code == 401
                ):
                # 401 response code is valid in case of password-protected
                # area.  That is a common case for default Plesk site, so we
                # don't consider it as failure. And we should not perform
                # any other kind of checks for such pages.
                return issues

            bad_status_code = self._check_code(
                url, domain, response, source_response, issues
            )
            if not bad_status_code:
                different_title = self._check_title(
                    url, domain, response, source_response, issues
                )
            else:
                # no sense to check HTML titles, as we already know
                # there is some issue with the site
                different_title = False

            # Signatures are also searched when there is no source response
            # to compare the target page with
            if bad_status_code or different_title or not source_response:
                # if there is something indicating that there is an issue with
                # the site - check for signatures for better diagnostics
                logger.debug(
                    u"Searching page contents and error logs for error messages"
                )
                self._check_signatures(
                    url, domain, response, source_response, issues
                )
                self._check_error_logs(logwatch, domain, issues)

            domain_url = '%s://%s/' % (domain.protocol, domain.domain_name)

            if check_referred_pages:
                # Links are checked without processing of their own links,
                # and only if no URL of the same (directory, extension)
                # group was checked before
                links = self._get_page_links(
                    domain_url, domain_relative_url, response.body, domain.www_alias_enabled
                )
                for link in links:
                    if not checked_urls.already_checked_group(link):
                        issues += self._check_domain_relative_url(
                            domain, link, 
                            checked_urls, check_referred_pages=False
                        )
                # Frames are checked recursively: links and frames
                # of a frame page are processed too
                # NOTE(review): current URL is added to checked_urls only at
                # the end of this method, so pages which refer to each other
                # (or to themselves) with frames look like able to recurse
                # until interpreter's recursion limit - confirm
                frames = self._get_page_frames(
                    domain_url, domain_relative_url, response.body, domain.www_alias_enabled
                )
                for frame in frames:
                    if not checked_urls.already_checked_recurse(frame):
                        issues += self._check_domain_relative_url(
                            domain, frame, 
                            checked_urls, check_referred_pages=True
                        )

            # Follow a redirect only if it points to a page of this domain
            if allow_redirect and response.code in (301, 302):
                redirect_location = response.header('location')
                if redirect_location:
                    redirect_path = self._get_local_link(
                        domain_url, domain_relative_url, redirect_location, domain.www_alias_enabled
                    )
                    if redirect_path:
                        issues += self._check_domain_relative_url(
                            domain, redirect_path, 
                            checked_urls, check_referred_pages=check_referred_pages, 
                            allow_redirect=False  # not more than one redirect
                        )

            # NOTE(review): this line is not reached on early returns above
            # (no response, 401 on both servers) and on exceptions, so such
            # URLs are never marked as checked - confirm that is intended
            checked_urls.add(domain_relative_url, check_referred_pages)
        except KeyboardInterrupt:
            # for compatibility with python 2.4
            raise
        except Exception, e:
            # Failure of a single URL check must not break checks of
            # other URLs, so report it as an issue and go on
            logger.debug(u"Exception:", exc_info=e)
            issues.append(DomainIssue(
                domain_name=domain.domain_name, 
                severity=Severity.WARNING, 
                category=WebsiteAvailabilityIssueType.INTERNAL_ERROR,
                problem=MSG(
                    'web_availability_internal_error',
                    error_message=str(e), 
                    url=url, server_ip=domain.web_server_ip
                )
            ))

        return issues

    @classmethod
    def _url_is_utf_encoded(cls, url):
        """Return true, if url is a valid UTF-8, otherwise return false."""
        try:
            unicode(url, errors='strict')
            return True
        except UnicodeDecodeError:
            invalid_url = unicode(url, errors='replace')
            logger.debug("Cannot decode the URL '%s'" % invalid_url)
        return False

    @classmethod
    def _normalize_path(cls, path):
        logger.debug("A path to normalize: %s", path)
        normalized_path_sections = []
        for path_part in path.split('/'):
            if path_part == '.':
                continue
            elif path_part == '..':
                if normalized_path_sections:
                    normalized_path_sections.pop()
            else:
                normalized_path_sections.append(path_part)

        normalized_path = '/'.join(normalized_path_sections)

        logger.debug("Normalized path: %s", normalized_path)
        return normalized_path

    @classmethod
    def _get_page_links(cls, domain_url, path, page_contents, www_alias_enabled=False):
        """Return domain-relative URLs of local links (<a href>) of a page.

        Equal 'href' values are processed once. As they are passed through
        a set, order of the returned list is not defined.
        """
        hrefs = set(
            HtmlParser.get_tag_attributes('a', ['href'], page_contents)
        )
        return cls._get_local_links(
            domain_url, path, hrefs, www_alias_enabled
        )

    @classmethod
    def _get_page_frames(cls, domain_url, path, page_contents, www_alias_enabled=False):
        """Return domain-relative URLs of local frames (<frame src>) of a page.

        Equal 'src' values are processed once. As they are passed through
        a set, order of the returned list is not defined.
        """
        sources = set(
            HtmlParser.get_tag_attributes('frame', ['src'], page_contents)
        )
        return cls._get_local_links(
            domain_url, path, sources, www_alias_enabled
        )

    @classmethod
    def _get_local_links(cls, domain_url, path, url_list, www_alias_enabled=False):
        """Filter URL list and return a list of local URLs for domain URL.
        
        This method searches for links that point to pages in the same domain.
        Returns:
            List of local relative URLs.
        """
        local_links = []
        for url in url_list:
            link = cls._get_local_link(domain_url, path, url, www_alias_enabled)
            if link is not None:
                local_links.append(link)
        return local_links

    @classmethod
    def _get_local_link(cls, domain_url, path, url, www_alias_enabled=False):
        homepage_url = urlparse(domain_url)

        (url_protocol, url_host, url_path) = urlparse(url)[:3]
        is_local = url_protocol == '' and url_host == '' 
        if is_local:
            if url.startswith('/'):  # relative to domains's root
                return url.lstrip('/')
            elif '/' not in path:  # we're in domains's root
                return url
            else:  # relative to current path, which is not domain's root
                path_dir = path[:path.rfind('/')]
                return '%s/%s' % (path_dir, url)
        same_domain = (
            url_protocol == homepage_url[0] and (
                url_host == homepage_url[1] or (
                    www_alias_enabled and
                    url_host == "www.%s" % homepage_url[1]
                )
            )
        )
        if same_domain:
            return url_path.lstrip('/')

    def _get_page(self, url, ip, domain, issues, severity=Severity.ERROR):
        """Return HTTP response"""
        try:
            client = http_client.HttpClient(self.website_availability_check_timeout)
            return client.get(url, ip)
        except httplib.IncompleteRead, e:
            logger.debug("Error:\n%s", traceback.format_exc())
            issues.append(DomainIssue(
                domain_name=domain.domain_name, 
                severity=severity,
                category=WebsiteAvailabilityIssueType.FAILED_HTTP_REQUEST,
                problem=MSG(
                    'web_availability_request_error',
                    server_ip=ip, url=url, error_message=str(e)
                )
            ))
            return None
            
        except IOError, e:
            logger.debug("Error:\n%s", traceback.format_exc())
            issues.append(DomainIssue(
                domain_name=domain.domain_name, 
                severity=severity,
                category=WebsiteAvailabilityIssueType.CONNECTION_ISSUE,
                problem=MSG(
                    'web_availability_failed_to_connect',
                    server_ip=ip, url=url, error_message=str(e)
                )
            ))
            return None

    @classmethod
    def _check_code(cls, url, domain, response, source_response, issues):
        """Check HTTP status code of the page, add found issues to the list.

        The following situations are reported:
        - source and target status codes differ (301 and 302 are considered
          equal): ERROR
        - target returned 4xx/5xx, and there is no source response: ERROR
        - both servers returned the same 4xx/5xx code: INFO

        Arguments:
            url: URL, which is being checked
            domain: domain description object
            response: target server response for URL
            source_response: source server response for URL, or None
            issues: a list of issues to be updated
        Returns:
            True if there are issues with status code,
            False otherwise
        """
        categories_by_code_class = {
            5: WebsiteAvailabilityIssueType.STATUS_CODE_5xx,
            4: WebsiteAvailabilityIssueType.STATUS_CODE_4xx,
        }
        has_source = source_response is not None

        if has_source and not cls._status_codes_equal(
            source_response.code, response.code
        ):
            issues.append(DomainIssue(
                domain_name=domain.domain_name,
                severity=Severity.ERROR,
                category=WebsiteAvailabilityIssueType.DIFFERENT_STATUS_CODE,
                problem=MSG(
                    'web_availability_different_http_code',
                    target_server_ip=domain.web_server_ip,
                    target_status_code=response.code,
                    source_server_ip=domain.source_web_server_ip,
                    source_status_code=source_response.code,
                    url=url
                )
            ))
            return True

        if response.code_class not in categories_by_code_class:
            return False

        if has_source:
            # the same error on both servers is not a migration problem
            severity = Severity.INFO
            category = categories_by_code_class[response.code_class]
            problem = MSG(
                'web_availability_webserver_returned_http_code_both',
                source_server_ip=domain.source_web_server_ip,
                target_server_ip=domain.web_server_ip,
                status_code=response.code,
                url=url
            )
        else:
            severity = Severity.ERROR
            category = categories_by_code_class[response.code_class]
            problem = MSG(
                'web_availability_webserver_returned_http_code_target',
                server_ip=domain.web_server_ip,
                status_code=response.code,
                url=url
            )

        issues.append(DomainIssue(
            domain_name=domain.domain_name,
            severity=severity,
            category=category,
            problem=problem
        ))
        return True

    @staticmethod
    def _status_codes_equal(source_code, target_code):
        """Check if we consider HTTP status codes equal
        """

        # During migration 301/302 codes may be exchanged,
        # that is an often situation when migrating from old Plesks
        # so we consider the equal
        redirect_codes = (301, 302)
        if source_code in redirect_codes and target_code in redirect_codes:
            return True
        return source_code == target_code

    @staticmethod
    def _check_signatures(
        url, domain, 
        response, source_response,
        issues
    ):
        """Look web pages for signatures - indicators of an issue.

        For example, signature of a problem is "PHP Fatal error"
        string in page contents. Function checks both target
        and source page (if available), and emits different issues
        if signature was found on source server or not.

        Arguments:
        - domain - instance of DomainWebService with info
        about particular domain to perform signatures check; list 
        of signature objects (instance of Signature class) are provided
        here too, to make possible check different sites for different 
        signatures
        - response - response from a (target) server
        - source_response - response from a source server;
        may be None, if we have single server only, or there were issues
        fetching the page
        - issues - list of issues where we should add new found issues
        """
        for signature in domain.signatures:
            page_signature_text = signature.find(response.body)
            if source_response is not None:
                source_page_signature_text = signature.find(
                    source_response.body
                )
                if source_page_signature_text is not None:
                    issues.append(DomainIssue(
                        domain_name=domain.domain_name, 
                        severity=Severity.ERROR, 
                        category=signature.problem_found_on_source, 
                        problem=MSG(
                            'web_availability_signature_found_on_source',
                            url=url, 
                            error_message=source_page_signature_text
                        )
                    ))
                elif page_signature_text is not None:
                    issues.append(DomainIssue(
                        domain_name=domain.domain_name, 
                        severity=Severity.ERROR, 
                        category=signature.problem_found_on_target_only, 
                        problem=MSG(
                            'web_availability_signature_found_on_target',
                            url=url, error_message=page_signature_text
                        )
                    ))
            else:
                if page_signature_text is not None:
                    issues.append(DomainIssue(
                        domain_name=domain.domain_name, 
                        severity=Severity.ERROR, 
                        category=signature.problem_found_on_target_only, 
                        problem=MSG(
                            'web_availability_signature_found_on_target', 
                            url=url, error_message=page_signature_text
                        )
                    ))

    @staticmethod
    def _check_title(
        url, domain, 
        response, source_response,
        issues
    ):
        """Compare HTML titles of migrated webpage.

        Arguments:
            url: URL, which is being checked
            domain: DomainWebService object
            response: target server response for URL
            source_response: source server response for URL
            issues: a list of issues to be updated
        Returns:
            True if there are issues with title,
            False otherwise 
        """
        if source_response is None: 
            # check is applicable for migration scenarios, 
            # when both source and target servers are available
            return

        titles = []
        for page in [source_response.body, response.body]:
            title = HtmlParser.get_tag_content('title', page)
            titles.append(title)

        error_text = None
        if len(titles[0]) > len(titles[1]):
            error_text = MSG(
                'web_availability_title_missing_target', url=url
            )
        elif len(titles[0]) < len(titles[1]):
            error_text = MSG(
                'web_availability_title_missing_source', url=url
            )
        elif not titles[0] == titles[1]:
            def is_plesk_frame_forwarding(response_body):
                return 'Your browser does not support frames' in response_body

            def equal_idn_insensitive(list1, list2):
                list1_idna = []
                list2_idna = []
                for item in list1:
                    list1_idna.append(_safe_string_decode(item).encode('idna'))
                for item in list2:
                    list2_idna.append(_safe_string_decode(item).encode('idna'))
                return list1_idna == list2_idna

            if not (
                # Check for special case - frame forwarding migration from Plesk 9.x
                # when domain has national symbols. In that case, title of page is updated -
                # on source Plesk 9.x you have IDN-encoded name, on target Plesk you have national symbols
                # in UTF-8 in page title
                is_plesk_frame_forwarding(source_response.body) and
                equal_idn_insensitive(titles[0], titles[1])
            ):
                def compose_title(title_list):
                    return " ".join([
                        _safe_string_decode(title)
                        for title in title_list
                    ])

                error_text = MSG(
                    'web_availability_title_changed',
                    url=url,
                    title_source=compose_title(titles[0]),
                    title_target=compose_title(titles[1])
                )

        if error_text:
            issue = DomainIssue(
                domain_name=domain.domain_name, 
                severity=Severity.ERROR, 
                category=WebsiteAvailabilityIssueType.DIFFERENT_TITLES, 
                problem=error_text,
            )
            issues.append(issue)
            return True 

        return False

    @classmethod
    def _check_error_logs(cls, logwatch, domain, issues):
        """Search in domain logs for errors; add errors to the list of issues"""
        for filename in logwatch.get_log_files():
            text = logwatch.get_new_records(filename) 
            for signature in domain.signatures:
                found_text = signature.find(text)
                if found_text is not None:
                    issues.append(DomainIssue(
                        domain_name=domain.domain_name, 
                        severity=Severity.ERROR, 
                        category=signature.problem_found_on_target_only, 
                        problem=MSG(
                            'web_availability_signature_found_in_target_log',
                            error_log_file=filename,
                            error_message=found_text
                        )
                    ))


class CheckedUrls(object):
    """Store which URLs were already checked to reduce number of checks"""

    def __init__(self):
        # list of (relative URL, was it checked recursively) pairs,
        # in the order the URLs were added
        self.checked_urls = []

    def add(self, url, recurse=False):
        """Mark URL as checked

        url (string) - relative URL
        recurse (boolean) - whether URL was checked recursively
            (which means that all links of that page
            were parsed and checked too)
        """
        self.checked_urls.append((url, recurse))

    def already_checked(self, url):
        """Return whether URL was already checked

        If URL was checked (no difference - recursively or not) - return
        True. Otherwise return False.
        """
        for checked_url, _ in self.checked_urls:
            if checked_url == url:
                return True
        return False

    def already_checked_group(self, url):
        """Return whether URL of the same group was already checked

        We perform grouping to reduce number of checks. By group
        we mean pair (directory, extension). If you check each of such
        groups at least once, you have good probability to have:
        - scripting settings of each directory checked
        - permissions of each directory checked

        So, for example if you have pages 'news/index.php' and
        'news/top-news.php', we consider that it is enough to check only one
        of them.
        """
        url_group = self._classify_url(url)
        for checked_url, _ in self.checked_urls:
            if self._classify_url(checked_url) == url_group:
                return True
        return False

    def already_checked_recurse(self, url):
        """Return whether URL was already checked recursively

        If URL was checked recursively - return True.
        Otherwise return False.
        """
        for checked_url, recurse in self.checked_urls:
            if recurse and checked_url == url:
                return True
        return False

    @staticmethod
    def _classify_url(url):
        """Classify URL by group to reduce number of checked URLs.

        Return a tuple of (directory, extension). Extension is taken from
        the last section of the path and includes the leading dot; it is
        an empty string if there is no dot in that section. Domain's root
        (empty URL) is a separate group ('/', '').

        Example:
            'old/news/index.php?page=1'
            ->
            ('old/news', '.php')
        """
        if url == '':
            return '/', ''

        # Get path excluding URL parameters and fragments
        # So for example:
        # - for URL 'test/index.php?page=news' you get 'test/index.php'
        # - for URL 'index.php#top' you get 'index.php'
        path = urlparse(url)[2]

        if '/' in path:
            dirname, filename = path.rsplit('/', 1)
        else:
            dirname, filename = '', path

        # Look for the extension in the file name only: a dot in a directory
        # name (like 'site.v2/index') is not a start of an extension
        ext_index = filename.rfind('.')
        if ext_index != -1:
            extension = filename[ext_index:]
        else:
            extension = ''

        return dirname, extension


def _safe_string_decode(string):
    """Decode string to unicode, fallback to Python's repr in case of failure
    
    If string can be interpreted as UTF-8, then return it as unicode string.
    Otherwise return its __repr__.  Usually you want to get just plain string,
    but if there are some issues with encoding (for example, server returned
    some binary data) - at least get its Python representation.

    Consider that website page may be encoded in another encoding, for example,
    CP1251, or KOI8-R. This function is very simple, it does not handle this
    situation at all.  Better way is to use some encoding detection library.
    """
    try:
        return u"%s" % (string.decode('utf-8'),)
    except UnicodeError:
        return u"%r" % (string,)
