package org.apache.tika.parser.microsoft;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.derby.catalog.Dependable;
import org.apache.pdfbox.pdmodel.common.PDPageLabelRange;
import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.PDLayoutAttributeObject;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/apache/tika/parser/microsoft/WordExtractor.class */
public class WordExtractor extends AbstractPOIFSExtractor {

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/parser/microsoft/WordExtractor$PicturesSource.class */
    /**
     * Bookkeeping for the pictures of one {@link HWPFDocument}.
     *
     * <p>On construction it indexes every picture of the document's
     * {@link PicturesTable} by the hexadecimal number encoded in its suggested file
     * name, and works out which pictures are not claimed by any character run for
     * which the pictures table reports a picture. Those unclaimed pictures can then
     * be handed out one at a time through {@link #nextUnclaimed()}. It also remembers
     * which pictures have already been emitted as embedded resources.
     */
    public static class PicturesSource {
        private final PicturesTable picturesTable;
        /** Pictures that were already passed on as embedded resources. */
        private final Set<Picture> output = new HashSet<Picture>();
        /** Pictures keyed by the number parsed (radix 16) from their suggested file name. */
        private final Map<Integer, Picture> lookup = new HashMap<Integer, Picture>();
        /** Copy of {@link #all} in which every picture claimed by a character run is nulled out. */
        private final List<Picture> nonU1based;
        /** All pictures as returned by the pictures table; positions define picture numbers. */
        private final List<Picture> all;
        /** Cursor into {@link #nonU1based} used by {@link #nextUnclaimed()}. */
        private int pn = 0;

        private PicturesSource(HWPFDocument hWPFDocument) {
            this.picturesTable = hWPFDocument.getPicturesTable();
            this.all = this.picturesTable.getAllPictures();

            // Index each picture by the hex number that precedes the extension
            // in its suggested file name.
            for (Picture picture : this.all) {
                String name = picture.suggestFullFileName();
                int dot = name.indexOf('.');
                if (dot > -1) {
                    name = name.substring(0, dot);
                }
                this.lookup.put(Integer.valueOf(Integer.parseInt(name, 16)), picture);
            }

            // Start with every picture and blank out the ones a character run claims.
            this.nonU1based = new ArrayList<Picture>(this.all);
            Range range = hWPFDocument.getRange();
            for (int i = 0; i < range.numCharacterRuns(); i++) {
                CharacterRun characterRun = range.getCharacterRun(i);
                if (this.picturesTable.hasPicture(characterRun)) {
                    int at = this.nonU1based.indexOf(getFor(characterRun));
                    // indexOf yields -1 when the picture was already blanked out by an
                    // earlier run, or when the run's offset is not in the lookup map;
                    // List.set(-1, ...) would throw, so skip those cases.
                    if (at >= 0) {
                        this.nonU1based.set(at, null);
                    }
                }
            }
        }

        /** Returns whether the pictures table reports a picture for the given run. */
        public boolean hasPicture(CharacterRun characterRun) {
            return this.picturesTable.hasPicture(characterRun);
        }

        /** Marks the picture as having been emitted. */
        public void recordOutput(Picture picture) {
            this.output.add(picture);
        }

        /** Returns whether {@link #recordOutput(Picture)} was already called for the picture. */
        public boolean hasOutput(Picture picture) {
            return this.output.contains(picture);
        }

        /**
         * Returns the 1-based position of the picture in the list of all pictures,
         * or 0 when the picture is not in that list.
         */
        public int pictureNumber(Picture picture) {
            return this.all.indexOf(picture) + 1;
        }

        /**
         * Looks up the picture registered under the run's picture offset.
         *
         * @return the matching picture, or {@code null} when none is registered
         */
        public Picture getFor(CharacterRun characterRun) {
            return this.lookup.get(Integer.valueOf(characterRun.getPicOffset()));
        }

        /**
         * Advances the internal cursor to the next picture that no character run
         * claimed and returns it.
         *
         * @return the next unclaimed picture, or {@code null} once all were handed out
         */
        public Picture nextUnclaimed() {
            while (this.pn < this.nonU1based.size()) {
                Picture picture = this.nonU1based.get(this.pn);
                this.pn++;
                if (picture != null) {
                    return picture;
                }
            }
            return null;
        }
    }

    /* loaded from: input_file:org/apache/tika/parser/microsoft/WordExtractor$TagAndStyle.class */
    /**
     * Immutable pair of an XHTML tag name and an optional CSS class name that
     * together describe how a paragraph should be rendered.
     */
    public static class TagAndStyle {
        private final String tag;
        private final String styleClass;

        /**
         * @param str  the tag name
         * @param str2 the style class, may be {@code null}
         */
        public TagAndStyle(String str, String str2) {
            tag = str;
            styleClass = str2;
        }

        /** Returns the tag name given at construction. */
        public String getTag() {
            return tag;
        }

        /** Returns the style class given at construction, possibly {@code null}. */
        public String getStyleClass() {
            return styleClass;
        }

        /** Returns {@code true} when the tag is exactly two characters long and begins with "h". */
        public boolean isHeading() {
            if (tag.length() != 2) {
                return false;
            }
            return tag.startsWith("h");
        }
    }

    /**
     * Creates an extractor for the given parse context; the context is passed
     * straight through to the {@link AbstractPOIFSExtractor} constructor.
     *
     * @param parseContext the parse context handed to the superclass
     */
    public WordExtractor(ParseContext parseContext) {
        super(parseContext);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Extracts a Word document from the given POIFS file system and writes it as
     * XHTML events.
     *
     * <p>Output order: header text, body paragraphs (tables included), footnotes,
     * comments, endnotes, footer text, any pictures no character run claimed, and
     * finally the embedded office documents found under the "ObjectPool" directory.
     * When POI reports the file as an old Word format, extraction is delegated to
     * {@link #parseWord6(POIFSFileSystem, XHTMLContentHandler)} instead.
     *
     * @param pOIFSFileSystem     the file system holding the Word document
     * @param xHTMLContentHandler the handler receiving the generated XHTML
     * @throws IOException   if reading the document fails
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if handling an embedded resource fails
     */
    public void parse(POIFSFileSystem pOIFSFileSystem, XHTMLContentHandler xHTMLContentHandler) throws IOException, SAXException, TikaException {
        try {
            HWPFDocument hWPFDocument = new HWPFDocument(pOIFSFileSystem);
            org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(hWPFDocument);
            addTextIfAny(xHTMLContentHandler, "header", wordExtractor.getHeaderText());

            PicturesTable picturesTable = hWPFDocument.getPicturesTable();
            PicturesSource picturesSource = new PicturesSource(hWPFDocument);
            Range range = hWPFDocument.getRange();

            // handleParagraph returns how many additional paragraphs it consumed
            // (non-zero for tables), so skip past those as well.
            int i = 0;
            while (i < range.numParagraphs()) {
                i += 1 + handleParagraph(range.getParagraph(i), 0, range, hWPFDocument, picturesSource, picturesTable, xHTMLContentHandler);
            }

            for (String footnote : wordExtractor.getFootnoteText()) {
                xHTMLContentHandler.element("p", footnote);
            }
            for (String comment : wordExtractor.getCommentsText()) {
                xHTMLContentHandler.element("p", comment);
            }
            for (String endnote : wordExtractor.getEndnoteText()) {
                xHTMLContentHandler.element("p", endnote);
            }
            addTextIfAny(xHTMLContentHandler, "footer", wordExtractor.getFooterText());

            // Emit every picture that no character run claimed.
            for (Picture picture = picturesSource.nextUnclaimed(); picture != null; picture = picturesSource.nextUnclaimed()) {
                handlePictureCharacterRun(null, picture, picturesSource, xHTMLContentHandler);
            }

            // Embedded office documents live under "ObjectPool". The lookup throws
            // FileNotFoundException when that entry does not exist, which simply
            // means there is nothing embedded to process.
            try {
                DirectoryEntry objectPool = (DirectoryEntry) pOIFSFileSystem.getRoot().getEntry("ObjectPool");
                for (Entry entry : objectPool) {
                    if (entry.getName().startsWith("_") && (entry instanceof DirectoryEntry)) {
                        handleEmbededOfficeDoc((DirectoryEntry) entry, xHTMLContentHandler);
                    }
                }
            } catch (FileNotFoundException ignored) {
                // No ObjectPool directory: the document has no embedded objects.
            }
        } catch (OldWordFileFormatException e2) {
            parseWord6(pOIFSFileSystem, xHTMLContentHandler);
        }
    }

    /**
     * Writes one paragraph, or the whole table that the paragraph starts, as XHTML.
     *
     * <p>A table is emitted only when the paragraph is inside a table, its table
     * level is above {@code i}, and {@code i} is 0. Each cell's paragraphs are then
     * handled recursively with the paragraph's table level as the new {@code i}.
     * Otherwise the paragraph is written as a single element whose tag and class
     * come from {@link #buildParagraphTagAndStyle(String, boolean)}.
     *
     * @param i the table level of the enclosing context (0 at document level)
     * @return the number of additional paragraphs consumed: 0 for a plain
     *         paragraph, {@code table.numParagraphs() - 1} for a table
     */
    private int handleParagraph(Paragraph paragraph, int i, Range range, HWPFDocument hWPFDocument, PicturesSource picturesSource, PicturesTable picturesTable, XHTMLContentHandler xHTMLContentHandler) throws SAXException, IOException, TikaException {
        boolean opensTable = paragraph.isInTable() && paragraph.getTableLevel() > i && i == 0;
        if (opensTable) {
            Table table = range.getTable(paragraph);
            xHTMLContentHandler.startElement("table");
            xHTMLContentHandler.startElement("tbody");
            for (int rowIndex = 0; rowIndex < table.numRows(); rowIndex++) {
                TableRow row = table.getRow(rowIndex);
                xHTMLContentHandler.startElement("tr");
                for (int cellIndex = 0; cellIndex < row.numCells(); cellIndex++) {
                    TableCell cell = row.getCell(cellIndex);
                    xHTMLContentHandler.startElement("td");
                    for (int cellParagraph = 0; cellParagraph < cell.numParagraphs(); cellParagraph++) {
                        // The return value is ignored here, exactly as for the top-level walk of a cell.
                        handleParagraph(cell.getParagraph(cellParagraph), paragraph.getTableLevel(), cell, hWPFDocument, picturesSource, picturesTable, xHTMLContentHandler);
                    }
                    xHTMLContentHandler.endElement("td");
                }
                xHTMLContentHandler.endElement("tr");
            }
            xHTMLContentHandler.endElement("tbody");
            xHTMLContentHandler.endElement("table");
            return table.numParagraphs() - 1;
        }

        String styleName = hWPFDocument.getStyleSheet().getStyleDescription(paragraph.getStyleIndex()).getName();
        TagAndStyle tagAndStyle = buildParagraphTagAndStyle(styleName, i > 0);
        if (tagAndStyle.getStyleClass() == null) {
            xHTMLContentHandler.startElement(tagAndStyle.getTag());
        } else {
            xHTMLContentHandler.startElement(tagAndStyle.getTag(), "class", tagAndStyle.getStyleClass());
        }

        for (int runIndex = 0; runIndex < paragraph.numCharacterRuns(); runIndex++) {
            CharacterRun run = paragraph.getCharacterRun(runIndex);
            String runText = run.text();
            if (runText.equals("\u0013")) {
                // Special run sequence: the helper reports how many runs to skip.
                runIndex += handleSpecialCharacterRuns(paragraph, runIndex, tagAndStyle.isHeading(), picturesSource, xHTMLContentHandler);
            } else if (runText.startsWith("\b")) {
                // One unclaimed picture is consumed per character of the run.
                for (int remaining = runText.length(); remaining > 0; remaining--) {
                    handlePictureCharacterRun(run, picturesSource.nextUnclaimed(), picturesSource, xHTMLContentHandler);
                }
            } else if (picturesTable.hasPicture(run)) {
                handlePictureCharacterRun(run, picturesSource.getFor(run), picturesSource, xHTMLContentHandler);
            } else {
                handleCharacterRun(run, tagAndStyle.isHeading(), xHTMLContentHandler);
            }
        }

        xHTMLContentHandler.endElement(tagAndStyle.getTag());
        return 0;
    }

    /**
     * Writes the text of one character run, wrapped in {@code b}, {@code i} and
     * {@code s} elements for bold, italic and strike-through runs.
     *
     * <p>A run consisting solely of a carriage return is skipped. Carriage returns
     * inside the text become line feeds, and one trailing {@code \u0007} character
     * is dropped.
     *
     * @param z when {@code true} no styling elements are written
     */
    private void handleCharacterRun(CharacterRun characterRun, boolean z, XHTMLContentHandler xHTMLContentHandler) throws SAXException {
        if (characterRun.text().equals("\r")) {
            return;
        }

        List<String> styleTags = new ArrayList<String>();
        if (!z) {
            if (characterRun.isBold()) {
                styleTags.add("b");
            }
            if (characterRun.isItalic()) {
                styleTags.add("i");
            }
            if (characterRun.isStrikeThrough()) {
                styleTags.add("s");
            }
            for (String styleTag : styleTags) {
                xHTMLContentHandler.startElement(styleTag);
            }
        }

        String content = characterRun.text().replace('\r', '\n');
        if (content.endsWith("\u0007")) {
            content = content.substring(0, content.length() - 1);
        }
        xHTMLContentHandler.characters(content);

        // Close the styling elements in the reverse of the order they were opened.
        int open = styleTags.size();
        while (open > 0) {
            open--;
            xHTMLContentHandler.endElement(styleTags.get(open));
        }
    }

    /**
     * Handles a field, i.e. the run sequence introduced by a {@code \u0013} run.
     *
     * <p>Starting at the run after index {@code i}, runs are collected as field
     * "controls" until a {@code \u0014} separator run is seen and as field "texts"
     * afterwards. Scanning stops at the {@code \u0015} run that ends the field; if
     * no separator was seen by then, the collected controls are treated as the
     * texts. A nested {@code \u0013} run is processed recursively and its runs are
     * skipped.
     *
     * <p>If the concatenated controls start with {@code HYPERLINK} and contain a
     * quoted target, the texts are wrapped in an {@code a} element whose
     * {@code href} is the text between the first and last double quote. Otherwise
     * the texts are written as pictures or plain character runs.
     *
     * @param i index of the {@code \u0013} run that opened this field
     * @param z passed on to {@link #handleCharacterRun} to suppress styling
     * @return the offset the caller must add to {@code i} to land on the last run
     *         consumed by this field (the {@code \u0015} run when one was found)
     */
    private int handleSpecialCharacterRuns(Paragraph paragraph, int i, boolean z, PicturesSource picturesSource, XHTMLContentHandler xHTMLContentHandler) throws SAXException, TikaException, IOException {
        List<CharacterRun> controls = new ArrayList<CharacterRun>();
        List<CharacterRun> texts = new ArrayList<CharacterRun>();
        boolean seenSeparator = false;

        int pos;
        for (pos = i + 1; pos < paragraph.numCharacterRuns(); pos++) {
            CharacterRun characterRun = paragraph.getCharacterRun(pos);
            String runText = characterRun.text();
            if (runText.equals("\u0013")) {
                // Nested field: pass the index of its opening run, the same contract
                // the top-level caller uses, so that pos ends up on the nested
                // field's closing run and the loop increment steps past it.
                pos += handleSpecialCharacterRuns(paragraph, pos, z, picturesSource, xHTMLContentHandler);
            } else if (runText.equals("\u0014")) {
                seenSeparator = true;
            } else if (runText.equals("\u0015")) {
                if (!seenSeparator) {
                    texts = controls;
                    controls = new ArrayList<CharacterRun>();
                }
                // End of this field: stop here so the remaining runs of the
                // paragraph are not swallowed into it.
                break;
            } else if (seenSeparator) {
                texts.add(characterRun);
            } else {
                controls.add(characterRun);
            }
        }

        if (controls.isEmpty()) {
            for (CharacterRun textRun : texts) {
                handleCharacterRun(textRun, z, xHTMLContentHandler);
            }
            return pos - i;
        }

        StringBuilder joined = new StringBuilder();
        for (CharacterRun control : controls) {
            joined.append(control.text());
        }
        String text = joined.toString();

        int firstQuote = text.indexOf('"');
        int lastQuote = text.lastIndexOf('"');
        // Require two distinct quotes; with a single quote substring() would throw.
        boolean isHyperlink = text.startsWith("HYPERLINK") && firstQuote > -1 && lastQuote > firstQuote;

        if (isHyperlink) {
            xHTMLContentHandler.startElement("a", "href", text.substring(firstQuote + 1, lastQuote));
            for (CharacterRun textRun : texts) {
                handleCharacterRun(textRun, z, xHTMLContentHandler);
            }
            xHTMLContentHandler.endElement("a");
        } else {
            for (CharacterRun textRun : texts) {
                if (picturesSource.hasPicture(textRun)) {
                    handlePictureCharacterRun(textRun, picturesSource.getFor(textRun), picturesSource, xHTMLContentHandler);
                } else {
                    handleCharacterRun(textRun, z, xHTMLContentHandler);
                }
            }
        }
        return pos - i;
    }

    /**
     * Writes an {@code img} element referring to the picture and, the first time
     * the picture is seen, hands its content on as an embedded resource.
     *
     * <p>The resource name is {@code image<N>} followed by a dot and the picture's
     * suggested extension when that extension is non-empty, where {@code N} is the
     * number reported by {@link PicturesSource#pictureNumber(Picture)}. Nothing
     * happens when {@code picture} is {@code null}.
     *
     * @param characterRun not used by this method
     */
    private void handlePictureCharacterRun(CharacterRun characterRun, Picture picture, PicturesSource picturesSource, XHTMLContentHandler xHTMLContentHandler) throws SAXException, IOException, TikaException {
        if (picture == null) {
            return;
        }

        String extension = picture.suggestFileExtension();
        StringBuilder nameBuilder = new StringBuilder("image");
        nameBuilder.append(picturesSource.pictureNumber(picture));
        if (extension.length() > 0) {
            nameBuilder.append(".").append(extension);
        }
        String resourceName = nameBuilder.toString();
        String mimeType = picture.getMimeType();

        // The image reference is written every time, even for repeated pictures.
        xHTMLContentHandler.startElement("img", "src", "embedded:" + resourceName);
        xHTMLContentHandler.endElement("img");

        // The picture content itself is only passed on once.
        if (!picturesSource.hasOutput(picture)) {
            handleEmbeddedResource(TikaInputStream.get(picture.getContent()), resourceName, mimeType, xHTMLContentHandler, false);
            picturesSource.recordOutput(picture);
        }
    }

    /**
     * Writes the text as a {@code p} inside a {@code div} carrying the given
     * class, provided the text is neither {@code null} nor empty.
     *
     * @param str  value of the {@code class} attribute on the {@code div}
     * @param str2 the text to write; nothing is written when it is null or empty
     */
    private void addTextIfAny(XHTMLContentHandler xHTMLContentHandler, String str, String str2) throws SAXException {
        boolean hasText = str2 != null && str2.length() > 0;
        if (hasText) {
            xHTMLContentHandler.startElement("div", "class", str);
            xHTMLContentHandler.element("p", str2);
            xHTMLContentHandler.endElement("div");
        }
    }

    /**
     * Extracts an old-format Word document by opening it as an
     * {@link HWPFOldDocument} and writing each paragraph text returned by
     * {@link Word6Extractor} as a {@code p} element.
     */
    protected void parseWord6(POIFSFileSystem pOIFSFileSystem, XHTMLContentHandler xHTMLContentHandler) throws IOException, SAXException, TikaException {
        HWPFOldDocument oldDocument = new HWPFOldDocument(pOIFSFileSystem);
        Word6Extractor extractor = new Word6Extractor(oldDocument);
        for (String paragraphText : extractor.getParagraphText()) {
            xHTMLContentHandler.element("p", paragraphText);
        }
    }

    /**
     * Maps a Word paragraph style name to the XHTML tag and CSS class used for it.
     *
     * <ul>
     *   <li>{@code null}, empty, "Default", "Normal", and "Table Contents" inside a
     *       table: plain {@code p} without a class.</li>
     *   <li>"Heading": {@code h1}. "Heading N": {@code hN}, where N is the last
     *       character of the name, limited to the range 1-6 (1 when it is not a
     *       digit).</li>
     *   <li>"Title": {@code h1} with class "title". "Subtitle": {@code h2} with
     *       class "subtitle".</li>
     *   <li>"HTML Preformatted": {@code pre}.</li>
     *   <li>Anything else: {@code p} with a class derived from the style name
     *       (spaces replaced by underscores, first character lower-cased).</li>
     * </ul>
     *
     * @param str the style name, may be {@code null} or empty
     * @param z   whether the paragraph is inside a table
     * @return the tag and style class to use, never {@code null}
     */
    public static TagAndStyle buildParagraphTagAndStyle(String str, boolean z) {
        String tag = "p";
        String styleClass = null;

        // A missing or empty style name cannot be mapped (the class derivation
        // below needs at least one character), so fall back to a plain paragraph.
        if (str == null || str.length() == 0) {
            return new TagAndStyle(tag, styleClass);
        }
        if (str.equals("Default") || str.equals("Normal") || (str.equals("Table Contents") && z)) {
            return new TagAndStyle(tag, styleClass);
        }

        if (str.equals("Heading")) {
            tag = "h1";
        } else if (str.startsWith("Heading ")) {
            int level = 1;
            try {
                level = Integer.parseInt(str.substring(str.length() - 1));
            } catch (NumberFormatException ignored) {
                // Name does not end in a digit: keep the default level of 1.
            }
            // Only h1..h6 exist in (X)HTML.
            level = Math.max(1, Math.min(level, 6));
            tag = "h" + level;
        } else if (str.equals("Title")) {
            tag = "h1";
            styleClass = "title";
        } else if (str.equals("Subtitle")) {
            tag = "h2";
            styleClass = "subtitle";
        } else if (str.equals("HTML Preformatted")) {
            tag = "pre";
        } else {
            String underscored = str.replace(' ', '_');
            // Locale.ROOT keeps the result independent of the default locale
            // (for example the Turkish dotless i).
            styleClass = underscored.substring(0, 1).toLowerCase(java.util.Locale.ROOT) + underscored.substring(1);
        }
        return new TagAndStyle(tag, styleClass);
    }
}
