关联算法Apriori的java实现,数据库使用redis

该算法是为了对一些专业文章进行词汇关联分析而实现的,并不是Apriori的最佳应用,却是对词频分析的一种实践。
package com.my.analysis;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;

import redis.clients.jedis.Jedis;

/**
 * Apriori-style association analysis over per-article word sets stored in Redis.
 *
 * <p>On construction the word set of every article is read from Redis and cached in
 * memory. {@link #jisuan()} then alternates between filtering candidate itemsets by
 * minimum support and growing the surviving (frequent) itemsets by one word, until no
 * further itemsets can be produced. Association rules whose confidence exceeds
 * {@code minconf} are printed to standard output.
 *
 * <p>Instances are stateful ({@code step}, the candidate/frequent lists and
 * {@link #allSub} are mutated by the public methods) and no synchronization is used.
 */
public class AprioriMyImpl {
	private final double minsup = 0.3;// minimum support, as a fraction of all articles
	private final double minconf = 0.99;// minimum confidence; a rule is printed only if it exceeds this
	private int limitword = 100;// number of top-ranked words for Redis-based seeding; unused by the hard-coded seeding below

	private final ArrayList<Set<String>> articleWordSets;// word set of each article, in load order

	private final ArrayList<Set<Set<String>>> candidateList;// candidate itemsets, one entry per round
	private final ArrayList<Set<Set<String>>> frequencyList;// frequent itemsets, one entry per round

	public Set<Set<String>> allSub = new HashSet<Set<String>>();// all non-empty subsets collected by genSub

	private long filecount;// total number of articles

	private int step = 1;// 1-based index of the current round


	/**
	 * Loads every article's word set from the Redis server at {@code localhost:6379}.
	 *
	 * <p>The article count is the length of the list {@code AnsjTxtFileParserForRedis.FILELIST};
	 * article {@code i} is read as the set stored under
	 * {@code AnsjTxtFileParserForRedis.FILEPREFIX + i}.
	 */
	public AprioriMyImpl() {
		candidateList = new ArrayList<Set<Set<String>>>();
		frequencyList = new ArrayList<Set<Set<String>>>();
		articleWordSets = new ArrayList<Set<String>>();

		// Redis is only needed while loading, so the connection is released right after.
		Jedis jedis = new Jedis("localhost", 6379);
		try {
			filecount = jedis.llen(AnsjTxtFileParserForRedis.FILELIST);
			for (int i = 0; i < filecount; i++) {
				articleWordSets.add(jedis.smembers(AnsjTxtFileParserForRedis.FILEPREFIX + i));
			}
		} finally {
			jedis.disconnect();
		}
	}

	/**
	 * Builds the first candidate collection: one single-word itemset per seed word,
	 * appends it to the candidate list and prints it.
	 *
	 * <p>The seed words are hard-coded. An earlier variant of this method instead took the
	 * top {@code limitword} members of the Redis sorted set
	 * {@code AnsjTxtFileParserForRedis.TABLENAME} via {@code zrevrange(key, 0, limitword - 1)}.
	 */
	public void item1_init() {
		String[] keys ={"睡眠","时间","宝宝","治疗","疾病","身体","呼吸","质量","孩子","入睡","人体","精神","习惯","心理","障碍","枕头","保健","关注","医生","女性","症状","食物","饮食","运动","中医","床垫","儿童","婴儿","阅读","大脑","按摩","效果","癫痫","环境","营养","压力","血液","智能","休息","妈妈","男人","生理","医学","社会","药物","肌肉","男性","科技","恢复","减肥","放松","神经","危害","情绪","怀孕","午睡","分泌","下降","反馈","音乐","刺激","糖尿病","姿势","老人","熬夜","消化","记忆","消除","起床","客户","食品","感冒","高血压","招聘","老年人","孕妇","手表","解决","现象","超过","颈椎","全身","空调","侧卧","位置","体温","金笔","达到","打鼾","电视","能量","催眠","物质","状况","精力","作者","设备","价格","病人","保护","数据","经验","正文","适合","妇科","锻炼","新生儿","咳嗽","抑郁症","血管","抑制","幼儿","失眠症","心脏病","食疗","血压","肿瘤","诱发","重视","心血管","寿命","小便","免疫力","月经","评测","记忆力","智力"};
		Set<Set<String>> candidate1 = new HashSet<Set<String>>();
		for (String s : keys) {
			HashSet<String> one = new HashSet<String>();
			one.add(s);
			candidate1.add(one);
		}
		candidateList.add(candidate1);
		System.out.println("候选项集-"+(step)+":");
		printSetSetString(candidate1);
	}

	/**
	 * Filters the current round's candidates down to those whose support ratio
	 * (matching articles / total articles) is at least {@code minsup}.
	 *
	 * <p>On success the frequent itemsets are stored and printed and {@code step} is advanced.
	 *
	 * @return {@code true} if at least one frequent itemset was found; {@code false}
	 *         otherwise (nothing is stored and {@code step} is left unchanged)
	 */
	public boolean candidateToFrequency() {
		Set<Set<String>> candItems = candidateList.get(step - 1);
		Set<Set<String>> freqItems = new HashSet<Set<String>>();
		for (Set<String> item : candItems) {
			// With zero articles the ratio is NaN, which fails the comparison as intended.
			if ((count_sup(item) / filecount) >= minsup) {
				freqItems.add(item);
			}
		}
		if (freqItems.isEmpty()) {// no candidate reached the minimum support
			return false;
		}
		frequencyList.add(freqItems);
		System.out.println("频繁项集-"+(step)+":");
		printSetSetString(freqItems);// print the frequent itemsets
		step++;
		return true;
	}

	/**
	 * Grows the most recent frequent itemsets into the next round's candidates.
	 *
	 * <p>Each frequent itemset is extended by every word that occurs in any frequent
	 * itemset of that round; an extension is kept only if it is exactly one word larger
	 * and all of its one-word-smaller subsets are themselves frequent.
	 *
	 * <p>Must be called after a successful {@link #candidateToFrequency()}, because it
	 * reads the frequent itemsets at index {@code step - 2}.
	 *
	 * @return {@code true} if new candidates were produced and stored; {@code false} otherwise
	 */
	public boolean frequencyToCandidate() {
		Set<Set<String>> frequencyItems = frequencyList.get(step - 2);
		Set<String> maxSub = maxSubSet(frequencyItems);
		Set<Set<String>> candidateItems = new HashSet<Set<String>>();
		for (Set<String> freqs : frequencyItems) {
			int len = freqs.size();
			for (String sub : maxSub) {
				Set<String> pItem = new HashSet<String>(freqs);
				pItem.add(sub);
				if (pItem.size() == (len + 1) && subIsFreq(frequencyItems, pItem)) {
					candidateItems.add(pItem);
				}
			}
		}
		if (candidateItems.isEmpty()) {// no new candidates could be formed
			return false;
		}
		candidateList.add(candidateItems);
		System.out.println("候选项集-"+(step)+":");
		printSetSetString(candidateItems);// print the candidate itemsets
		return true;
	}

	/**
	 * Checks that every subset of {@code parentSet} obtained by removing exactly one
	 * element is contained in {@code freq}.
	 *
	 * @param freq      the collection of frequent itemsets
	 * @param parentSet the itemset whose one-smaller subsets are checked
	 * @return {@code true} if all such subsets are in {@code freq}; {@code false} otherwise
	 */
	public boolean subIsFreq(Set<Set<String>> freq, Set<String> parentSet) {
		for (String s : parentSet) {
			Set<String> item = new HashSet<String>(parentSet);
			item.remove(s);
			if (!freq.contains(item)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Returns the union of all words appearing in the given itemsets.
	 *
	 * @param freqIntems the itemsets to flatten
	 * @return a new set containing every word of every itemset
	 */
	public Set<String> maxSubSet(Set<Set<String>> freqIntems) {
		Set<String> maxSub = new HashSet<String>();
		for (Set<String> ss : freqIntems) {
			maxSub.addAll(ss);
		}
		return maxSub;
	}

	/**
	 * Counts the articles whose word set contains every word of {@code x}.
	 *
	 * @param x the itemset to look for
	 * @return the number of matching articles (an absolute count, not a ratio)
	 */
	public double count_sup(Set<String> x) {
		int matches = 0;
		for (Set<String> words : articleWordSets) {
			if (words.containsAll(x)) {
				matches++;
			}
		}
		return matches;
	}

	/**
	 * Computes the confidence of the rule {@code x => y}, i.e.
	 * {@code count_sup(x ∪ y) / count_sup(x)}.
	 *
	 * @param x the antecedent itemset
	 * @param y the consequent itemset
	 * @return the confidence, or {@code 0} when no article contains {@code x}
	 *         (instead of the NaN a division by zero would produce)
	 */
	public double cout_cand(Set<String> x, Set<String> y) {
		double antecedentCount = count_sup(x);
		if (antecedentCount == 0) {
			return 0;
		}
		Set<String> z = new HashSet<String>(x);
		z.addAll(y);
		return count_sup(z) / antecedentCount;
	}

	/**
	 * Adds every non-empty subset of {@code parent} (including {@code parent} itself)
	 * to {@link #allSub}. Copies are stored, so later changes to {@code parent} do not
	 * affect the collected sets.
	 *
	 * @param parent the itemset whose subsets are collected
	 */
	public void genSub(Set<String> parent) {
		collectSubsets(new HashSet<String>(parent), new HashSet<Set<String>>());
	}

	/**
	 * Recursive worker for {@link #genSub(Set)}; {@code visited} ensures each distinct
	 * subset is expanded only once instead of once per removal order.
	 */
	private void collectSubsets(Set<String> current, Set<Set<String>> visited) {
		if (current.isEmpty() || !visited.add(current)) {
			return;
		}
		allSub.add(current);
		for (String s : current) {
			Set<String> smaller = new HashSet<String>(current);
			smaller.remove(s);
			collectSubsets(smaller, visited);
		}
	}

	/**
	 * Prints each itemset on its own line.
	 *
	 * @param sss the itemsets to print
	 */
	public void printSetSetString(Set<Set<String>> sss) {
		for (Set<String> ss : sss) {
			System.out.println(ss);
		}
	}

	/**
	 * Prints every rule {@code x => y} formed from two disjoint members of
	 * {@code subSet} whose confidence is strictly greater than {@code minconf}.
	 *
	 * @param subSet the itemsets to pair up as antecedent and consequent
	 */
	public void releRuleCount(Set<Set<String>> subSet) {
		for (Set<String> x : subSet) {
			for (Set<String> y : subSet) {
				Set<String> xy = new HashSet<String>(x);
				xy.addAll(y);
				// The union is as large as both parts together only when x and y share no word.
				if (xy.size() == (x.size() + y.size())) {
					double confidence = cout_cand(x, y);
					if (confidence > minconf) {
						System.out.println(x+"==>>"+y+"=="+confidence);
					}
				}
			}
		}
	}

	/**
	 * Runs the whole analysis: seeds the candidates, iterates until no further frequent
	 * or candidate itemsets appear, then prints the rules derived from the subsets of
	 * each itemset in the last (largest) frequent round.
	 */
	public void jisuan() {
		item1_init();// initialise the first candidate collection
		while (true) {
			if (!candidateToFrequency()) {
				break;
			}
			if (!frequencyToCandidate()) {
				break;
			}
		}
		// If not even the first round produced a frequent itemset there is nothing to analyse.
		if (frequencyList.isEmpty()) {
			System.out.println("没有满足最小支持度的频繁项集");
			return;
		}
		Set<Set<String>> maxfreqs = frequencyList.get(frequencyList.size() - 1);
		for (Set<String> maxfreq : maxfreqs) {
			allSub = new HashSet<Set<String>>();
			genSub(maxfreq);
			releRuleCount(allSub);
		}
	}

	public static void main(String[] args) {
		// Load the data from Redis and run the full analysis.
		new AprioriMyImpl().jisuan();
	}

}

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。