package org.talend.dataquality.semantic.statistics;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.lang3.StringUtils;
import org.eclipse.osgi.internal.loader.BundleLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.common.inference.AvroAnalyzer;
import org.talend.dataquality.common.util.AvroUtils;
import org.talend.dataquality.semantic.index.Index;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.recognizer.DefaultCategoryRecognizer;
import org.talend.dataquality.semantic.snapshot.DictionarySnapshot;
import org.talend.dataquality.semantic.statistics.model.AvroSemanticCategory;

/* loaded from: input_file:org/talend/dataquality/semantic/statistics/AvroSemanticAnalyzer.class */
/**
 * {@link AvroAnalyzer} that runs semantic category recognition on the leaf values of Avro records.
 *
 * <p>For each analyzed record, a result record built on the "record discovery" schema is produced,
 * in which every leaf value is replaced by a {@link #SEM_DISCOVERY_SCHEMA} record holding the
 * categories recognized for that value. One {@link DefaultCategoryRecognizer} is kept per item id
 * (as computed by {@code AvroUtils.itemId}), and {@link #getResult()} writes the frequencies
 * accumulated by these recognizers into the {@value #DISCOVERY_AGGREGATE_PROP_NAME} property of a
 * copy of the analyzed schema.
 *
 * <p>Recognizers are held in plain {@link HashMap}s and streams are forced sequential; no
 * synchronization is performed by this class.
 */
public class AvroSemanticAnalyzer implements AvroAnalyzer {
    /** Name of the schema property that receives the aggregated discovery result. */
    public static final String DISCOVERY_AGGREGATE_PROP_NAME = "talend.component.semanticType";
    public static final String DISCOVERY_TOTAL_FIELD_NAME = "total";
    public static final String DISCOVERY_MATCHING_FIELD_NAME = "matching";
    public static final String DISCOVERY_TOP_RANKED_FIELD_NAME = "topRanked";
    public static final String DISCOVERY_MATCHING_ID_PROP_NAME = "id";
    public static final String DISCOVERY_MATCHING_NAME_PROP_NAME = "name";
    public static final String DISCOVERY_MATCHING_TOTAL_PROP_NAME = "total";
    public static final String DISCOVERY_MATCHING_SCORE_PROP_NAME = "frequency";
    public static final String DISCOVERY_MATCHING_DEPTH_PROP_NAME = "depth";

    private static final Logger LOGGER = LoggerFactory.getLogger(AvroSemanticAnalyzer.class);

    private static final String SEM_DISCOVERY_SCHEMA_DEF = "{\"type\": \"record\",\"name\": \"discovery_metadata\","
            + " \"namespace\": \"org.talend.dataquality\",\"fields\":["
            + "{ \"type\":\"string\", \"name\":\"matching\"},"
            + " { \"type\":\"int\", \"name\":\"total\"}]}";

    /** Schema of the per-value discovery record placed at every leaf of a result record. */
    public static final Schema SEM_DISCOVERY_SCHEMA = new Schema.Parser().parse(SEM_DISCOVERY_SCHEMA_DEF);

    /**
     * Shared helper used only to resolve union branches. It is created with the default
     * constructor (exactly what was previously instantiated on every union value) so resolution
     * behaves the same without the per-value allocation.
     */
    private static final GenericData UNION_RESOLVER = new GenericData();

    private final DictionarySnapshot dictionarySnapshot;
    private Schema semanticSchema;
    private Schema globalDiscoverySchema;
    private Schema recordDiscoverySchema;

    /** Not populated by this class; only consulted when annotating union schemas. */
    protected final Map<String, SemanticType> discoveryResults = new HashMap<>();

    /** One recognizer per item id, created lazily when the first value of that item is analyzed. */
    private final Map<String, DefaultCategoryRecognizer> categoryRecognizers = new HashMap<>();

    /**
     * Creates an analyzer backed by the given dictionary snapshot.
     *
     * @param dictionarySnapshot the snapshot handed to every recognizer; must not be {@code null}
     * @throws NullPointerException if {@code dictionarySnapshot} is {@code null}
     */
    public AvroSemanticAnalyzer(DictionarySnapshot dictionarySnapshot) {
        this.dictionarySnapshot = Objects.requireNonNull(dictionarySnapshot, "Dictionary dictionarySnapshot is null.");
    }

    /**
     * Clears the accumulated results and recognizers and, when the snapshot exposes a custom data
     * dictionary, initializes its index. The result schemas are left untouched.
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public void init() {
        this.discoveryResults.clear();
        this.categoryRecognizers.clear();
        Index customDataDict = this.dictionarySnapshot.getCustomDataDict();
        if (customDataDict != null) {
            customDataDict.initIndex();
        }
    }

    /**
     * Resets the analyzer (see {@link #init()}) and derives the result schemas from {@code schema}
     * after it has been cleaned of the {@value #DISCOVERY_AGGREGATE_PROP_NAME} property and
     * dereferenced by {@code AvroUtils}.
     *
     * @param schema the schema of the records that will be analyzed
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public void init(Schema schema) {
        init();
        Schema cleaned = AvroUtils.cleanSchema(schema, Collections.singletonList(DISCOVERY_AGGREGATE_PROP_NAME));
        initResultSchema(AvroUtils.dereferencing(cleaned));
    }

    /** Stores {@code schema} as the semantic schema and builds the global and per-record result schemas from it. */
    private void initResultSchema(Schema schema) {
        this.semanticSchema = schema;
        this.globalDiscoverySchema = AvroUtils.copySchema(this.semanticSchema);
        this.recordDiscoverySchema = AvroUtils.createRecordSemanticSchema(this.semanticSchema, SEM_DISCOVERY_SCHEMA);
    }

    /**
     * Analyzes one record, discarding the per-record result.
     *
     * @param indexedRecord the record to analyze; {@code null} is ignored
     * @return always {@code true}
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public boolean analyze(IndexedRecord indexedRecord) {
        analyzeRecord(indexedRecord);
        return true;
    }

    /**
     * Lazily maps every record of the stream to its per-record discovery result.
     *
     * @param stream the records to analyze
     * @return a sequential stream of result records ({@code null} elements map to {@code null})
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public Stream<IndexedRecord> analyze(Stream<IndexedRecord> stream) {
        return stream.sequential().map(this::analyzeRecord);
    }

    /**
     * Analyzes a top-level record. If no schema was provided through {@link #init(Schema)}, the
     * schema of the first analyzed record is used as is to build the result schemas.
     */
    private IndexedRecord analyzeRecord(IndexedRecord indexedRecord) {
        if (indexedRecord == null) {
            return null;
        }
        if (this.semanticSchema == null) {
            initResultSchema(indexedRecord.getSchema());
        }
        GenericData.Record result = new GenericData.Record(this.recordDiscoverySchema);
        analyzeRecord("", indexedRecord, result, this.semanticSchema);
        return result;
    }

    /**
     * Analyzes every field of {@code source} and stores the outcome in the same-named field of
     * {@code result}. Fields absent from the result schema or from the semantic schema are logged
     * and skipped.
     *
     * @param parentId id of the enclosing item, used as prefix for the field item ids
     * @param source the record being analyzed
     * @param result the record receiving the analysis of each field
     * @param recordSemanticSchema the semantic schema matching {@code source}
     */
    private void analyzeRecord(String parentId, IndexedRecord source, GenericRecord result, Schema recordSemanticSchema) {
        for (Schema.Field field : source.getSchema().getFields()) {
            String fieldName = field.name();
            Schema.Field resultField = result.getSchema().getField(fieldName);
            Schema.Field semanticField = recordSemanticSchema.getField(fieldName);
            if (resultField == null) {
                LOGGER.error("{} field is missing from result record schema.", fieldName);
            } else if (semanticField == null) {
                LOGGER.error("{} field is missing from semantic schema.", fieldName);
            } else {
                Object analyzed = analyzeItem(AvroUtils.itemId(parentId, fieldName), source.get(field.pos()), field.schema(),
                        resultField.schema(), semanticField.schema());
                result.put(fieldName, analyzed);
            }
        }
    }

    /**
     * Recursively analyzes one value according to its schema type.
     *
     * @param itemId id under which the recognizer for this item is registered
     * @param value the value to analyze
     * @param valueSchema schema describing {@code value}
     * @param resultSchema schema of the result to build for this value
     * @param itemSemanticSchema semantic schema matching {@code valueSchema}
     * @return a record, array or map mirroring the structure of {@code value}, whose leaves are
     *         {@link #SEM_DISCOVERY_SCHEMA} records
     * @throws IllegalStateException if the schema type is not handled
     */
    private Object analyzeItem(String itemId, Object value, Schema valueSchema, Schema resultSchema, Schema itemSemanticSchema) {
        switch (valueSchema.getType()) {
            case RECORD: {
                GenericData.Record nested = new GenericData.Record(resultSchema);
                // Only IndexedRecord methods are needed, so any IndexedRecord implementation is accepted.
                analyzeRecord(itemId, (IndexedRecord) value, nested, itemSemanticSchema);
                return nested;
            }
            case ARRAY: {
                // All elements share the item id of the array, hence the same recognizer.
                List<Object> analyzedElements = ((List<?>) value).stream()
                        .map(element -> analyzeItem(itemId, element, valueSchema.getElementType(),
                                resultSchema.getElementType(), itemSemanticSchema.getElementType()))
                        .collect(Collectors.toList());
                return new GenericData.Array<>(resultSchema, analyzedElements);
            }
            case MAP: {
                // All values share the item id of the map; keys are kept unchanged.
                Map<Object, Object> analyzedEntries = new HashMap<>();
                for (Map.Entry<?, ?> entry : ((Map<?, ?>) value).entrySet()) {
                    analyzedEntries.put(entry.getKey(), analyzeItem(itemId, entry.getValue(), valueSchema.getValueType(),
                            resultSchema.getValueType(), itemSemanticSchema.getValueType()));
                }
                return analyzedEntries;
            }
            case UNION: {
                int branchIndex = UNION_RESOLVER.resolveUnion(valueSchema, value);
                Schema branchSchema = valueSchema.getTypes().get(branchIndex);
                String branchItemId = AvroUtils.itemId(itemId, branchSchema.getName());
                // The result union is matched by branch name; fall back to the leaf schema when absent.
                Schema branchResultSchema = resultSchema.getTypes().stream()
                        .filter(candidate -> candidate.getName().equals(branchSchema.getName()))
                        .findFirst()
                        .orElse(SEM_DISCOVERY_SCHEMA);
                return analyzeItem(branchItemId, value, branchSchema, branchResultSchema,
                        itemSemanticSchema.getTypes().get(branchIndex));
            }
            case ENUM:
            case FIXED:
            case STRING:
            case BYTES:
            case INT:
            case LONG:
            case FLOAT:
            case DOUBLE:
            case BOOLEAN:
            case NULL: {
                GenericData.Record leaf = new GenericData.Record(SEM_DISCOVERY_SCHEMA);
                leaf.put(DISCOVERY_MATCHING_FIELD_NAME, analyzeLeafValue(itemId, value));
                return leaf;
            }
            default:
                throw new IllegalStateException("Unexpected Avro Schema.Type: " + valueSchema.getType());
        }
    }

    /**
     * Feeds a leaf value to the recognizer registered for {@code itemId} (creating it on first use)
     * and converts the recognized category names into {@link AvroSemanticCategory} instances.
     *
     * @param itemId id of the item the value belongs to
     * @param value the leaf value; {@code null} is processed as the empty string
     * @return the list of categories recognized for this value
     */
    private Object analyzeLeafValue(String itemId, Object value) {
        DefaultCategoryRecognizer recognizer = this.categoryRecognizers.computeIfAbsent(itemId,
                id -> new DefaultCategoryRecognizer(this.dictionarySnapshot));
        String text = value == null ? "" : value.toString();
        return Arrays.stream(recognizer.process(text)).map(categoryName -> {
            DQCategory metadata = this.dictionarySnapshot.getCategoryMetadataByName(categoryName);
            return new AvroSemanticCategory(metadata.getId(), metadata.getName());
        }).collect(Collectors.toList());
    }

    /**
     * Writes the frequencies accumulated so far into the global discovery schema and returns it.
     *
     * @return the global discovery schema, annotated with {@value #DISCOVERY_AGGREGATE_PROP_NAME} properties
     * @throws IllegalStateException if no schema has been set, neither by {@link #init(Schema)} nor
     *         by analyzing a first record
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public Schema getResult() {
        if (this.semanticSchema == null) {
            throw new IllegalStateException("The analyzer should be initialized first.");
        }
        for (Schema.Field field : this.globalDiscoverySchema.getFields()) {
            updateDiscovery(field.schema(), field.name());
        }
        return this.globalDiscoverySchema;
    }

    /**
     * Walks {@code schema} and attaches the global frequencies of {@code itemId} to every leaf
     * schema (and to union schemas having an entry in {@link #discoveryResults}). Failures to add
     * the property are logged, with their cause, and do not interrupt the traversal.
     */
    private void updateDiscovery(Schema schema, String itemId) {
        switch (schema.getType()) {
            case RECORD:
                for (Schema.Field field : schema.getFields()) {
                    updateDiscovery(field.schema(), AvroUtils.itemId(itemId, field.name()));
                }
                return;
            case ARRAY:
                updateDiscovery(schema.getElementType(), itemId);
                return;
            case MAP:
                updateDiscovery(schema.getValueType(), itemId);
                return;
            case UNION:
                if (this.discoveryResults.containsKey(itemId)) {
                    try {
                        schema.addProp(DISCOVERY_AGGREGATE_PROP_NAME, getGlobalFrequencies(itemId));
                    } catch (AvroRuntimeException e) {
                        LOGGER.error("Failed to add prop to field {}.", schema.getName(), e);
                    }
                }
                for (Schema branch : schema.getTypes()) {
                    updateDiscovery(branch, AvroUtils.itemId(itemId, branch.getName()));
                }
                return;
            case ENUM:
            case FIXED:
            case STRING:
            case BYTES:
            case INT:
            case LONG:
            case FLOAT:
            case DOUBLE:
            case BOOLEAN:
            case NULL:
                try {
                    schema.addProp(DISCOVERY_AGGREGATE_PROP_NAME, getGlobalFrequencies(itemId));
                } catch (AvroRuntimeException e) {
                    LOGGER.error("Failed to add prop to referenced type {}. The analyzer is not supporting schema with referenced types.",
                            schema.getName(), e);
                }
                return;
            default:
                return;
        }
    }

    /**
     * Builds the aggregated discovery result of one item from its recognizer.
     *
     * @param itemId id of the item
     * @return an empty map when no recognizer exists for the item; otherwise a map holding the
     *         matching categories (those with a non-empty name), the recognizer total and, when at
     *         least one category matched, the first of them as top ranked
     */
    private Map<String, Object> getGlobalFrequencies(String itemId) {
        DefaultCategoryRecognizer recognizer = this.categoryRecognizers.get(itemId);
        if (recognizer == null) {
            return Collections.emptyMap();
        }
        List<Map<String, Object>> matchings = recognizer.getResult(itemId, 0.1f).stream()
                .filter(frequency -> StringUtils.isNotEmpty(frequency.getCategoryName()))
                .map(frequency -> {
                    Map<String, Object> matching = new HashMap<>();
                    matching.put(DISCOVERY_MATCHING_ID_PROP_NAME, frequency.getCategoryId());
                    matching.put(DISCOVERY_MATCHING_NAME_PROP_NAME, frequency.getCategoryName());
                    matching.put(DISCOVERY_MATCHING_TOTAL_PROP_NAME, Long.valueOf(frequency.getCount()));
                    matching.put(DISCOVERY_MATCHING_SCORE_PROP_NAME, Float.valueOf(frequency.getScore()));
                    matching.put(DISCOVERY_MATCHING_DEPTH_PROP_NAME, Integer.valueOf(frequency.getCategoryLevel()));
                    return matching;
                })
                .collect(Collectors.toList());
        Map<String, Object> frequencies = new HashMap<>();
        frequencies.put(DISCOVERY_MATCHING_FIELD_NAME, matchings);
        frequencies.put(DISCOVERY_TOTAL_FIELD_NAME, Long.valueOf(recognizer.getTotal()));
        if (!matchings.isEmpty()) {
            frequencies.put(DISCOVERY_TOP_RANKED_FIELD_NAME, matchings.get(0));
        }
        return frequencies;
    }

    /**
     * @return a single-element list holding {@link #getResult()}
     */
    @Override // org.talend.dataquality.common.inference.AvroAnalyzer
    public List<Schema> getResults() {
        return Collections.singletonList(getResult());
    }

    /** Calls {@code end()} on every recognizer created so far. */
    @Override // java.lang.AutoCloseable
    public void close() {
        this.categoryRecognizers.values().forEach(DefaultCategoryRecognizer::end);
    }
}
