# coding: utf8
"""Rule based Sentence tokenization module"""

# Global Variables
_URDU_CONJUNCTIONS = ['جنہیں', 'جس', 'جن', 'جو', 'اور', 'اگر', 'اگرچہ', 'لیکن', 'مگر', 'پر', 'یا', 'تاہم', 'کہ', 'کر',
                      'تو', 'گے', 'گی']
_URDU_NEWLINE_WORDS = ['کیجیے', 'کیجئے', 'گئیں', 'تھیں', 'ہوں', 'خریدا', 'گے', 'ہونگے', 'گا', 'چاہیے', 'ہوئیں', 'گی',
                       'تھا', 'تھی', 'تھے', 'ہیں', 'ہے',
                       ]


def _split_and_keep(_str, separator):
    """Replace end of sentence with separator"""
    if not _str:
        return []
    max_p = chr(ord(max(_str)) + 1)
    return _str.replace(separator, separator + max_p).split(max_p)


def _segment_words(words):
    """Group ``words`` into segments that end after sentence-final words.

    A segment is closed after a word listed in ``_URDU_NEWLINE_WORDS`` when
    another word follows it and that following word is not listed in
    ``_URDU_CONJUNCTIONS``. When the following word is a standalone "۔" or
    "،" token, it is kept at the end of the segment being closed.

    Args:
        words (list): tokens of one piece of text, in order.
    Returns:
        list: one list of words per segment; a segment may be empty.
    """
    segments = []
    current = []
    skip_next = False
    last_index = len(words) - 1

    for index, word in enumerate(words):
        if skip_next:
            # This punctuation token was already attached to the segment
            # closed on the previous iteration.
            skip_next = False
            continue

        current.append(word)
        if index == last_index or word not in _URDU_NEWLINE_WORDS:
            continue

        following = words[index + 1]
        if following in _URDU_CONJUNCTIONS:
            # A conjunction follows, so no break is inserted here.
            continue

        if following in ("۔", "،"):
            current.append(following)
            skip_next = True
        segments.append(current)
        current = []

    segments.append(current)
    return segments


def _generate_sentences(text: str) -> list:
    """Generate a list of Urdu sentences from a given string.

    The text is split after every '۔' and then after every '؟'. Each piece
    is further broken after words from ``_URDU_NEWLINE_WORDS`` unless the
    next word is in ``_URDU_CONJUNCTIONS``. Runs of whitespace, including
    new lines, collapse to single spaces in the output. Chunks and resulting
    sentences with fewer than two words are dropped.

    Args:
        text (str): base string
    Returns:
        list: sentences as strings, in order of appearance
    """
    all_sentences = []

    for chunk in _split_and_keep(text, '۔'):
        if len(chunk.split()) < 2:
            continue

        # A chunk without '؟' comes back from _split_and_keep as a single
        # piece, so one code path serves both question and plain chunks.
        for piece in _split_and_keep(chunk, '؟'):
            all_sentences.extend(
                " ".join(segment)
                for segment in _segment_words(piece.split())
                if len(segment) >= 2
            )

    return all_sentences