// Author: maple
// date: 9/24/25

package org.openautonomousconnection.htmlparser;

import dev.unlegitdqrk.unlegitlibrary.string.StringUtils;
import org.openautonomousconnection.htmlparser.html.body.misc.HTMLComment;

import lombok.Getter;
import lombok.Setter;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Replaces comments, quoted strings and text nodes of an HTML document with
 * numbered placeholders ({@link #extract()}) and puts them back afterwards
 * ({@link #insert()}).
 *
 * <p>Placeholders written into {@link #getContent() content}:
 * <ul>
 *   <li>comments: {@code <!--C0-->}, {@code <!--C1-->}, ...</li>
 *   <li>quoted strings: {@code "S0"} / {@code 'S0'}, ... (the quote style is kept)</li>
 *   <li>text between tags: {@code >T0<}, {@code >T1<}, ...</li>
 * </ul>
 * The number in a placeholder is the position of the extracted value in the
 * corresponding list.
 *
 * <p>NOTE(review): the comment pattern and the {@code <!--C n-->} placeholder
 * were reconstructed by analogy with the string/text placeholders because the
 * previous revision contained empty string literals at those places — confirm
 * against callers that inspect the intermediate content.
 */
public class DocumentBuilder {

    /** A complete HTML comment; group 1 is the text between the delimiters. */
    private static final Pattern COMMENT = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);

    /** A double- or single-quoted string; group 1 (double) or group 2 (single) is the content. */
    private static final Pattern QUOTED = Pattern.compile("\"(.*?)\"|'(.*?)'", Pattern.DOTALL);

    /** Text following a {@code >} up to (not including) the next {@code <}; group 1 is the text. */
    private static final Pattern TEXT = Pattern.compile(">([^<]+)(?=<)", Pattern.DOTALL);

    /** Canonical non-negative decimal index that always fits into an {@code int}. */
    private static final String INDEX = "(0|[1-9]\\d{0,8})";

    private static final Pattern COMMENT_PLACEHOLDER = Pattern.compile("<!--C" + INDEX + "-->");
    private static final Pattern QUOTED_PLACEHOLDER =
            Pattern.compile("\"S" + INDEX + "\"|'S" + INDEX + "'");
    private static final Pattern TEXT_PLACEHOLDER = Pattern.compile(">T" + INDEX + "(?=<)");

    /** The document text; holds placeholders between {@link #extract()} and {@link #insert()}. */
    @Getter
    @Setter
    protected String content;

    /** Comments removed from the content, in order of appearance. */
    @Getter
    protected List<HTMLComment> comments;

    /** Contents of quoted strings removed from the content (without the quotes). */
    @Getter
    protected List<String> attributes;

    /** Text nodes removed from the content. */
    @Getter
    protected List<String> texts;

    /** Reserved for {@link #extractTags()}, which is not implemented yet; starts out empty. */
    @Getter
    protected List<String> tags;

    /**
     * Creates a builder for the given document text.
     *
     * @param content the document text to work on
     */
    public DocumentBuilder(String content) {
        // NOTE: a former newline-stripping step (content.replace("\n", "")) is disabled.
        this.content = content;

        this.comments = new ArrayList<>();
        this.attributes = new ArrayList<>();
        this.texts = new ArrayList<>();
        // Previously left unassigned, which made getTags() return null.
        this.tags = new ArrayList<>();
    }

    /**
     * Extracts all comments, quoted strings and texts into their lists,
     * leaving placeholders in the content.
     */
    public void extract() {
        this.extractComments();
        this.extractStringsAndAttributes();
        this.extractTexts();
    }

    /**
     * Inserts the extracted values back into the content, in the reverse
     * order of {@link #extract()}, and empties the lists.
     */
    public void insert() {
        this.insertTexts();
        this.insertStringsAndAttributes();
        this.insertComments();
    }

    /**
     * Moves every {@code <!--...-->} comment into {@link #comments} and leaves
     * a {@code <!--C n-->} placeholder at its position.
     */
    protected void extractComments() {
        Matcher matcher = COMMENT.matcher(this.content);
        StringBuilder rewritten = new StringBuilder();

        while (matcher.find()) {
            int index = this.comments.size();
            this.comments.add(new HTMLComment(matcher.group(1)));
            // Replace exactly this occurrence; a global String.replace would also
            // hit identical text elsewhere in the document.
            matcher.appendReplacement(rewritten, "<!--C" + index + "-->");
        }

        matcher.appendTail(rewritten);
        this.content = rewritten.toString();
    }

    /**
     * Replaces every known comment placeholder with the string form of the
     * stored comment and clears {@link #comments}. Does nothing if no comments
     * are stored.
     *
     * <p>NOTE(review): assumes {@code HTMLComment.toString()} renders the full
     * comment markup — confirm.
     */
    protected void insertComments() {
        if (this.comments.isEmpty()) {
            return;
        }

        Matcher matcher = COMMENT_PLACEHOLDER.matcher(this.content);
        StringBuilder restored = new StringBuilder();

        while (matcher.find()) {
            int index = Integer.parseInt(matcher.group(1));
            if (index >= this.comments.size()) {
                continue; // not one of ours: keep the text unchanged
            }
            String comment = this.comments.get(index).toString();
            matcher.appendReplacement(restored, Matcher.quoteReplacement(comment));
        }

        matcher.appendTail(restored);
        this.content = restored.toString();
        this.comments.clear();
    }

    /**
     * Moves the content of every double- or single-quoted string into
     * {@link #attributes} and leaves {@code "S n"} / {@code 'S n'} (same quote
     * style, no space) at its position.
     */
    protected void extractStringsAndAttributes() {
        Matcher matcher = QUOTED.matcher(this.content);
        StringBuilder rewritten = new StringBuilder();

        while (matcher.find()) {
            boolean doubleQuoted = matcher.group(1) != null;
            String quote = doubleQuoted ? "\"" : "'";
            int index = this.attributes.size();

            this.attributes.add(doubleQuoted ? matcher.group(1) : matcher.group(2));
            matcher.appendReplacement(rewritten, quote + "S" + index + quote);
        }

        matcher.appendTail(rewritten);
        this.content = rewritten.toString();
    }

    /**
     * Replaces every known string placeholder with the stored value, keeping
     * the placeholder's quote style, and clears {@link #attributes}. Does
     * nothing if no strings are stored.
     */
    protected void insertStringsAndAttributes() {
        if (this.attributes.isEmpty()) {
            return;
        }

        Matcher matcher = QUOTED_PLACEHOLDER.matcher(this.content);
        StringBuilder restored = new StringBuilder();

        // Single pass: a restored value that happens to look like a placeholder
        // is not substituted a second time.
        while (matcher.find()) {
            boolean doubleQuoted = matcher.group(1) != null;
            String quote = doubleQuoted ? "\"" : "'";
            int index = Integer.parseInt(doubleQuoted ? matcher.group(1) : matcher.group(2));
            if (index >= this.attributes.size()) {
                continue; // not one of ours: keep the text unchanged
            }
            String value = quote + this.attributes.get(index) + quote;
            matcher.appendReplacement(restored, Matcher.quoteReplacement(value));
        }

        matcher.appendTail(restored);
        this.content = restored.toString();
        this.attributes.clear();
    }

    /**
     * Moves every run of text between a {@code >} and the next {@code <} into
     * {@link #texts} and leaves a {@code T n} placeholder at its position.
     * Runs for which {@code StringUtils.isEmptyString} returns {@code true}
     * are left in place.
     */
    protected void extractTexts() {
        Matcher matcher = TEXT.matcher(this.content);
        StringBuilder rewritten = new StringBuilder();

        while (matcher.find()) {
            String text = matcher.group(1);
            if (StringUtils.isEmptyString(text)) {
                continue; // skipped matches are copied verbatim by the next append
            }
            int index = this.texts.size();
            this.texts.add(text);
            // The match covers ">" plus the text; the following "<" is only a lookahead.
            matcher.appendReplacement(rewritten, ">T" + index);
        }

        matcher.appendTail(rewritten);
        this.content = rewritten.toString();
    }

    /**
     * Replaces every known text placeholder with the stored text and clears
     * {@link #texts}. Does nothing if no texts are stored.
     */
    protected void insertTexts() {
        if (this.texts.isEmpty()) {
            return;
        }

        Matcher matcher = TEXT_PLACEHOLDER.matcher(this.content);
        StringBuilder restored = new StringBuilder();

        while (matcher.find()) {
            int index = Integer.parseInt(matcher.group(1));
            if (index >= this.texts.size()) {
                continue; // not one of ours: keep the text unchanged
            }
            String value = ">" + this.texts.get(index);
            matcher.appendReplacement(restored, Matcher.quoteReplacement(value));
        }

        matcher.appendTail(restored);
        this.content = restored.toString();
        this.texts.clear();
    }

    /**
     * Placeholder for tag extraction; currently does nothing.
     */
    protected void extractTags() {

    }
}