package com.marklogic.contentpump;

import com.marklogic.contentpump.utilities.CSVParserFormatter;
import com.marklogic.contentpump.utilities.DelimitedSplit;
import com.marklogic.contentpump.utilities.IdGenerator;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/* loaded from: input_file:com/marklogic/contentpump/SplitDelimitedTextReader.class */
/**
 * Reader for one {@link DelimitedSplit} of a delimited text file.
 *
 * <p>The column names come from the header carried by the split rather than
 * from the file itself. The reader positions the stream at the beginning of
 * the split (adjusted so that it does not land inside a multi-byte UTF-8
 * sequence), discards the first record it sees there, and then emits records
 * whose starting byte offset lies before the end of the split.
 *
 * @param <VALUEIN> type of the value produced by this reader
 */
public class SplitDelimitedTextReader<VALUEIN> extends DelimitedTextReader<VALUEIN> {
    public static final Log LOG = LogFactory.getLog(SplitDelimitedTextReader.class);

    /** Parser error text identifying a malformed record that is skipped instead of failing the task. */
    private static final String INVALID_CHAR_MESSAGE =
            "invalid char between encapsulated token and delimiter";

    /** Byte offset of the first byte of this split. */
    private long start;
    /** Byte offset just past the last byte of this split ({@code start + length}). */
    private long end;
    /** Line terminator detected in the file, or {@code null} when none was found. */
    private String lineSeparator;

    /**
     * Reads the configuration, records the byte range of the split and sets up the parser.
     *
     * @param inputSplit         the split to read; cast to both {@link FileSplit} and {@link DelimitedSplit}
     * @param taskAttemptContext the task context supplying the configuration
     * @throws IOException          if the file system or the file cannot be accessed
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        initConfig(taskAttemptContext);
        initDocType();
        initDelimConf();
        setFile(((FileSplit) inputSplit).getPath());
        this.fs = this.file.getFileSystem(taskAttemptContext.getConfiguration());
        DelimitedSplit delimitedSplit = (DelimitedSplit) inputSplit;
        this.start = delimitedSplit.getStart();
        this.end = this.start + delimitedSplit.getLength();
        initParser(inputSplit);
    }

    /**
     * Advances to the next record of the split.
     *
     * <p>Returns {@code true} both for records that were turned into a document and for
     * records that were marked as skipped (column count mismatch, document builder
     * failure, malformed encapsulation).
     *
     * @return {@code false} when no parser is available, the parser is exhausted, or the
     *         next record starts at or beyond the end of the split; {@code true} otherwise
     * @throws IOException          if reading the underlying stream fails
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (this.parser == null || this.parserIterator == null) {
            return false;
        }
        try {
            if (!this.parserIterator.hasNext()) {
                this.bytesRead = this.fileLen;
                return false;
            }
            CSVRecord csvRecord = getRecordLine();
            // A record that begins at or after the split end is outside this reader's range.
            if (csvRecord.getCharacterByte() >= this.end) {
                return false;
            }
            String[] values = getLine(csvRecord);
            if (values.length != this.fields.length) {
                setSkipKey(0, 0, "number of fields does not match number of columns");
                return true;
            }
            this.docBuilder.newDoc();
            for (int i = 0; i < this.fields.length; i++) {
                // Columns with an empty name are not added to the document.
                if (this.fields[i].equals("")) {
                    continue;
                }
                if (!this.generateId && this.uriId == i && setKey(values[i], 0, 0, true)) {
                    return true;
                }
                try {
                    this.docBuilder.put(this.fields[i], values[i]);
                } catch (Exception e) {
                    setSkipKey(0, 0, e.getMessage());
                    return true;
                }
            }
            this.docBuilder.build();
            if (this.generateId && setKey(this.idGen.incrementAndGet(), 0, 0, true)) {
                return true;
            }
            if (this.value instanceof Text) {
                ((Text) this.value).set(this.docBuilder.getDoc());
            } else {
                ((Text) ((ContentWithFileNameWritable) this.value).getValue()).set(this.docBuilder.getDoc());
            }
            return true;
        } catch (RuntimeException e) {
            // getMessage() may be null; in that case this is not the parse error we know
            // how to skip, so rethrow the original exception instead of failing with an NPE.
            String message = e.getMessage();
            if (message == null || !message.contains(INVALID_CHAR_MESSAGE)) {
                throw e;
            }
            setSkipKey(0, 0, INVALID_CHAR_MESSAGE);
            // Step past the offending record if the iterator still has one.
            if (this.parserIterator.hasNext()) {
                this.parserIterator.next();
            }
            return true;
        }
    }

    /**
     * Opens the file, positions it at the start of the split and creates the CSV parser.
     *
     * <p>A first parser, created without a quote character, is used only to consume the
     * first record at the seek position. If another record follows, the stream is
     * re-positioned at that record's byte offset and a second parser, using {@code "} as
     * quote character, is created from there.
     *
     * <p>When the file cannot be opened, the URI column cannot be found, or the field
     * configuration is rejected, the problem is logged (where applicable) and the method
     * returns without a usable parser.
     *
     * @param inputSplit the split to read
     * @throws IOException          if reading or seeking the file fails
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void initParser(InputSplit inputSplit) throws IOException, InterruptedException {
        this.fileIn = openFile(inputSplit, true);
        if (this.fileIn == null) {
            return;
        }
        this.fields = ((DelimitedSplit) inputSplit).getHeader().toStrings();
        try {
            this.docBuilder.configFields(this.conf, this.fields);
            this.lineSeparator = retrieveLineSeparator(this.fileIn);

            // Fall back to the raw split start when the adjusted position cannot be determined.
            long seekPos = this.start;
            try {
                seekPos = adjustStartPos();
            } catch (IllegalArgumentException e) {
                LOG.error("File: " + this.file.toUri() + " may lose records, reason: " + e.getMessage());
            }
            this.fileIn.seek(seekPos);
            this.instream = new InputStreamReader((InputStream) this.fileIn, this.encoding);
            this.bytesRead = 0L;
            this.fileLen = inputSplit.getLength();

            if (this.uriName == null) {
                this.generateId = this.conf.getBoolean(ConfigConstants.CONF_INPUT_GENERATE_URI, false);
                if (this.generateId) {
                    this.idGen = new IdGenerator(this.file.toUri().getPath() + "-" + ((FileSplit) inputSplit).getStart());
                } else {
                    // No URI column configured and no generated ids: use the first column.
                    this.uriId = 0;
                }
            }

            // Locate the configured URI column unless the URI source is already settled.
            boolean uriResolved = this.generateId || this.uriId == 0;
            for (int i = 0; i < this.fields.length && !uriResolved; i++) {
                if (this.fields[i].equals(this.uriName)) {
                    this.uriId = i;
                    uriResolved = true;
                }
            }
            if (!uriResolved) {
                LOG.error("Skipped file: " + this.file.toUri() + ", reason: uri_id " + this.uriName + " is not found");
                return;
            }

            this.parser = new CSVParser(this.instream, CSVParserFormatter.getFormat(this.delimiter, null, false, false), seekPos, 0L, this.encoding);
            this.parserIterator = this.parser.iterator();
            if (this.parserIterator.hasNext()) {
                // Discard the first record found at the seek position.
                getLine();
                if (this.parserIterator.hasNext()) {
                    // Restart parsing, now with quote handling, at the byte offset of the following record.
                    long recordStart = getRecordLine().getCharacterByte();
                    this.fileIn.seek(recordStart);
                    this.instream = new InputStreamReader((InputStream) this.fileIn, this.encoding);
                    this.parser = new CSVParser(this.instream, CSVParserFormatter.getFormat(this.delimiter, '\"', false, false), recordStart, 0L, this.encoding);
                    this.parserIterator = this.parser.iterator();
                }
            }
        } catch (IllegalArgumentException e) {
            LOG.error("Skipped file: " + this.file.toUri() + ", reason: " + e.getMessage());
        }
    }

    /**
     * Scans the stream from its current position for the first line terminator.
     *
     * <p>A second terminator character is only treated as part of the separator when it
     * differs from the first one ({@code "\r\n"} or {@code "\n\r"}). Two identical
     * characters in a row are two line breaks (a blank line), not one separator.
     *
     * <p>The stream position is advanced by the scan; callers are expected to seek afterwards.
     *
     * @param fSDataInputStream stream to scan
     * @return the detected separator, or {@code null} if the stream contains none
     * @throws IOException if reading the stream fails
     */
    private String retrieveLineSeparator(FSDataInputStream fSDataInputStream) throws IOException {
        while (fSDataInputStream.available() > 0) {
            char first = (char) fSDataInputStream.read();
            if (first != '\n' && first != '\r') {
                continue;
            }
            String separator = String.valueOf(first);
            if (fSDataInputStream.available() > 0) {
                char second = (char) fSDataInputStream.read();
                boolean isTerminator = second == '\r' || second == '\n';
                if (isTerminator && second != first) {
                    separator = separator + second;
                }
            }
            return separator;
        }
        return null;
    }

    /**
     * Joins field values back into one line of text ending with the line separator.
     *
     * <p>Fields are joined with the delimiter. When the last field is blank, it is omitted
     * but the delimiter preceding it is kept. When no line separator was detected in the
     * file, an empty separator is used. An empty array yields just the separator.
     *
     * @param strArr field values of one record
     * @return the reconstructed line
     */
    @Override
    protected String convertToLine(String[] strArr) {
        // A file without any line terminator has no separator to reproduce.
        String separator = this.lineSeparator == null ? "" : this.lineSeparator;
        if (strArr.length == 0) {
            return separator;
        }
        StringBuilder sb = new StringBuilder();
        int last = strArr.length - 1;
        for (int i = 0; i < last; i++) {
            sb.append(strArr[i]);
            sb.append(this.delimiter);
        }
        if (!strArr[last].trim().equals("")) {
            sb.append(strArr[last]);
        }
        sb.append(separator);
        return sb.toString();
    }

    /**
     * Returns the position at which reading of this split should begin.
     *
     * <p>Only a UTF-8 file whose split does not start at offset 0 needs adjusting; the
     * encoding name is compared case-insensitively.
     *
     * @return the adjusted start position, or {@code start} when no adjustment applies
     * @throws IllegalArgumentException if the bytes around the split start are not valid UTF-8
     * @throws IOException              if reading the file fails
     */
    private long adjustStartPos() throws IllegalArgumentException, IOException {
        if (this.start != 0 && "UTF-8".equalsIgnoreCase(this.encoding)) {
            return findStartPosUTF8();
        }
        return this.start;
    }

    /**
     * Walks backwards from the split start until a byte that can begin a UTF-8 character
     * is found.
     *
     * <p>If the byte just before that position is a line feed or carriage return, the
     * position of that byte is returned. Otherwise the found position is returned,
     * advanced past the delimiter when the delimiter's UTF-8 bytes are located exactly there.
     *
     * @return the adjusted position, or {@code start} if the loop ends without a match
     * @throws IllegalArgumentException if no character start is found within four
     *                                  positions or an invalid byte is encountered
     * @throws IOException              if reading the file fails
     */
    private long findStartPosUTF8() throws IllegalArgumentException, IOException {
        for (long pos = this.start; this.fileIn.available() > 0 && pos > 0; pos--) {
            // A UTF-8 character is at most four bytes long.
            if (this.start - pos >= 4) {
                throw new IllegalArgumentException("Invalid UTF-8 encoding");
            }
            this.fileIn.seek(pos - 1);
            int previousByte = this.fileIn.read();
            int currentByte = this.fileIn.read();
            if (checkFirstByteUTF8(currentByte)) {
                if (previousByte == 10 || previousByte == 13) {
                    return pos - 1;
                }
                return pos + delimiterOffsetUTF8(this.delimiter, pos);
            }
        }
        return this.start;
    }

    /**
     * Tells whether a byte value can be the first byte of a UTF-8 encoded character.
     *
     * @param i unsigned byte value as returned by {@code read()}
     * @return {@code false} for a continuation byte (128-191); {@code true} for a
     *         single-byte character (0-127) or a lead byte (192-247)
     * @throws IllegalArgumentException for any other value, including negative ones
     */
    private boolean checkFirstByteUTF8(int i) throws IllegalArgumentException {
        if (i >= 128 && i <= 191) {
            return false;
        }
        boolean singleByte = i >= 0 && i <= 127;
        boolean leadByte = i >= 192 && i <= 247;
        if (singleByte || leadByte) {
            return true;
        }
        throw new IllegalArgumentException("Invalid UTF-8 encoding");
    }

    /**
     * Checks whether the UTF-8 encoding of the delimiter is located at the given position.
     *
     * @param c the delimiter character
     * @param j byte position in the file to inspect
     * @return the number of bytes of the encoded delimiter if it is found at {@code j},
     *         otherwise 0
     * @throws IOException if seeking or reading the file fails
     */
    private int delimiterOffsetUTF8(char c, long j) throws IOException {
        byte[] delimiterBytes = String.valueOf(c).getBytes(StandardCharsets.UTF_8);
        this.fileIn.seek(j);
        for (byte expected : delimiterBytes) {
            if (this.fileIn.available() <= 0 || this.fileIn.read() != expected) {
                return 0;
            }
        }
        return delimiterBytes.length;
    }
}
