// Example: build a CorpusQuality pipeline from its component checks and run it
// on one sample text.
//
// Each step comment sits on its own line. A `//` comment extends to the end of
// the physical line, so a statement written after it on the same line would be
// silently commented out and its variable left undeclared.

// Path of the corpus hash data used for deduplication (read and save).
// NOTE(review): hashFile is not used by the statements below; confirm whether
// it should be passed to DeDuplication or is consumed by later code.
var hashFile = PropertiesReader.get("dedeplication_hash_path");

// 1. Rule-based quality check.
var ruleQuality = new RuleQuality();

// 2. Sensitive-word and advertising detection.
var simpleSenDetectionProcessor = SimpleSenDetectionProcessor.newInstance();
// NOTE(review): the literal "sensitive_words_path" is passed directly here,
// whereas the other paths are resolved through PropertiesReader.get(...).
// Presumably getKWSeeker resolves the key itself; verify.
var senDetection = simpleSenDetectionProcessor.getKWSeeker("sensitive_words_path");

var ad_detect_model_path = PropertiesReader.get("ad_detect_model_path");
var ad_dict_path = PropertiesReader.get("ad_dict_path");
var stop_words_path = PropertiesReader.get("stop_words_path");
var adDetection = new AdDetection(ad_detect_model_path, ad_dict_path, stop_words_path);

// 3. Text deduplication.
// NOTE(review): the meaning of the arguments (4, 3) is not visible here;
// see the DeDuplication constructor.
var deDuplication = new DeDuplication(4, 3);

// 4. Quality evaluation backed by the configured language model path.
var ngramModelPath = PropertiesReader.get("language_model_path");
var qualityEvaluation = new QualityEvaluation(ngramModelPath);

// Combine all checks into one evaluator.
// NOTE(review): the meaning of the trailing 100 is not visible here;
// see the CorpusQuality constructor.
var corpusQuality = new CorpusQuality(ruleQuality, senDetection, adDetection, deDuplication, qualityEvaluation, 100);

// Evaluate a sample text and print the result to standard output.
var corpus = "对未按土地、环保和投资管理等法律法规履行相关手续或手续不符合规定的违规项目,地方政府要按照要求进行全面清理。一,凡是未开工的违规项目,一律不得开工建设;二,凡是不符合产业政策、准入标准、环保要求的违规项目一律停建。";
var result = corpusQuality.quality(corpus);
System.out.println(result);