package org.caudexorigo.text;

import java.io.InputStream;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.caudexorigo.io.IOUtils;

/* loaded from: input_file:org/caudexorigo/text/HtmlStripper.class */
public class HtmlStripper {
    private static final Pattern breaker = Pattern.compile("(<blockquote|<center|<div|<p|<br|<h\\d|<ul|<dl|<ol|<hr|<table)", 2);
    private static final Pattern markup_cleaner = Pattern.compile("<xml.*?xml>|<style.*?style>|<script.*?script>|<.*?>", 34);
    private static final Pattern space_cleaner = Pattern.compile("[\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u00a0\\u1680\\u000b\\u0020\\u00a0\\u1680\\u202f\\u205f\\u3000\\u0009]");
    private static final Pattern nl_cleaner = Pattern.compile("[\r\\u000a\\u000c\\u000d\\u0085\\u2028\\u2029]");
    private static final Pattern trim_space = Pattern.compile("^[ \t]+|[ \t]+$", 8);
    private static final Pattern multi_space = Pattern.compile("[ \t]{2,}");
    private static final Pattern multi_ln = Pattern.compile("[\\x0B\n]{2,}");

    public static String strip(String str) {
        if (StringUtils.isBlank(str)) {
            return "";
        }
        Matcher matcher = breaker.matcher(str);
        return multi_space.matcher(multi_ln.matcher(trim_space.matcher(nl_cleaner.matcher(space_cleaner.matcher(StringEscapeUtils.unescapeHtml4(markup_cleaner.matcher(matcher.find() ? matcher.replaceAll("\n" + matcher.group(1)) : str).replaceAll(" "))).replaceAll(" ")).replaceAll("\n")).replaceAll("")).replaceAll("\n")).replaceAll(" ").trim();
    }

    public static void main(String[] strArr) throws Throwable {
        System.out.println(strip(IOUtils.toString(new URL("http://technotes.blogs.sapo.pt/").openStream())));
    }
}
