Explorar el Código

퀴즈 크롤링 코드 분석

tobby48 hace 5 años
padre
commit
7e00ee3e76

+ 18
- 0
pom.xml Ver fichero

@@ -196,6 +196,24 @@
196 196
 		 
197 197
 		<!-- CLIENT SIDE -->
198 198
 		
199
+		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
200
+		<dependency>
201
+		    <groupId>org.apache.commons</groupId>
202
+		    <artifactId>commons-lang3</artifactId>
203
+		    <version>3.9</version>
204
+		</dependency>
205
+		<dependency>
206
+			<groupId>org.scala-lang</groupId>
207
+			<artifactId>scala-library</artifactId>
208
+			<version>2.12.7</version>
209
+		</dependency>
210
+		
211
+		<dependency>
212
+		    <groupId>com.ibm.icu</groupId>
213
+			<artifactId>icu4j</artifactId>
214
+			<version>63.1</version>
215
+		</dependency>
216
+		
199 217
 	</dependencies>
200 218
 	
201 219
 	<build>

+ 331
- 0
src/main/java/kr/co/swh/lecture/opensource/icu/HolidayUtils.java Ver fichero

@@ -0,0 +1,331 @@
1
+package kr.co.swh.lecture.opensource.icu;
2
+
3
+import java.text.ParseException;
4
+import java.text.SimpleDateFormat;
5
+import java.util.ArrayList;
6
+import java.util.Arrays;
7
+import java.util.Calendar;
8
+import java.util.Date;
9
+import java.util.HashMap;
10
+import java.util.Iterator;
11
+import java.util.List;
12
+import java.util.Map;
13
+import java.util.SortedSet;
14
+import java.util.TreeMap;
15
+import java.util.TreeSet;
16
+
17
+import com.google.common.collect.Lists;
18
+import com.ibm.icu.util.ChineseCalendar;
19
+
20
+import scala.Tuple2;
21
+ 
22
+public class HolidayUtils {
23
+ 
24
+    public Map<String, String> solarHolidayMap = new HashMap<String, String>();
25
+    public Map<String, String> lunarHolidayMap = new HashMap<String, String>();
26
+ 
27
+    public String getDateByString(Date date) {
28
+        return getDateByString(date, "-");
29
+    }
30
+ 
31
+    public String getDateByString(Date date, String separator) {
32
+        SimpleDateFormat sdf = new SimpleDateFormat("yyyy"+separator+"MM"+separator+"dd");
33
+        return sdf.format(date);
34
+    }
35
+ 
36
+    /**
37
+     * 양력날짜를 음력날짜로 변환
38
+     * @param 양력날짜 (yyyyMMdd)
39
+     * @return 음력날짜 (yyyyMMdd)
40
+     */
41
+//    private String converSolarToLunar(String date) {
42
+//        return converSolarToLunar(date, "-");
43
+//    }
44
+ 
45
+    private String converSolarToLunar(String date, String separator) {
46
+        ChineseCalendar cc = new ChineseCalendar();
47
+        Calendar cal = Calendar.getInstance();
48
+ 
49
+        cal.set(Calendar.YEAR, Integer.parseInt(date.substring(0, 4)));
50
+        cal.set(Calendar.MONTH, Integer.parseInt(date.substring(4, 6)) - 1);
51
+        cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(date.substring(6)));
52
+ 
53
+        cc.setTimeInMillis(cal.getTimeInMillis());
54
+ 
55
+        int y = cc.get(ChineseCalendar.EXTENDED_YEAR) - 2637;
56
+        int m = cc.get(ChineseCalendar.MONTH) + 1;
57
+        int d = cc.get(ChineseCalendar.DAY_OF_MONTH);
58
+ 
59
+        StringBuffer ret = new StringBuffer();
60
+        ret.append(String.format("%04d", y)).append(separator);
61
+        ret.append(String.format("%02d", m)).append(separator);
62
+        ret.append(String.format("%02d", d));
63
+ 
64
+        return ret.toString();
65
+    } // end converSolarToLunar
66
+ 
67
+    public String getDay(String date, int amount) {
68
+        Calendar cal = Calendar.getInstance();
69
+        cal.set(Integer.parseInt(date.substring(0, 4)), Integer.parseInt(date.substring(4, 6)) - 1, Integer.parseInt(date.substring(6)));
70
+        cal.add(Calendar.DAY_OF_MONTH, amount);
71
+ 
72
+        return getDateByString(cal.getTime(), "");
73
+    }
74
+    
75
+    /**
76
+     * 해당일자가 대체공휴일에 해당하는 지 확인
77
+     * @param 양력날짜 (yyyyMMdd)
78
+     * @return 대체 공휴일이면 true
79
+     */
80
+    private boolean isHolidayAlternate(String date) {
81
+        
82
+        String[] altHoliday = new String[] {
83
+                "20150929", "20160210", "20170130", "20180926", 
84
+                "20180507", "20190506", "20200127", "20220912", 
85
+                "20230124", "20240212", "20240506", "20251008", 
86
+                "20270209", "20290924", "20290507"}; 
87
+        
88
+        return Arrays.asList(altHoliday).contains(date); 
89
+        
90
+        /*
91
+        int year = Integer.parseInt(date.substring(0, 4));
92
+        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
93
+        
94
+        // 설날
95
+        String dayFirst2 = convertLunarToSolar(year + "0101");
96
+        String dayFirst3 = convertLunarToSolar(year + "0102");
97
+        String dayFirst1 = String.valueOf(Integer.parseInt(dayFirst2) - 1);     
98
+        
99
+        // 추석
100
+        String dayThanks1 = convertLunarToSolar(year + "0814");
101
+        String dayThanks2 = convertLunarToSolar(year + "0815");
102
+        String dayThanks3 = convertLunarToSolar(year + "0816");
103
+        
104
+        // 어린이날
105
+        String dayChild = year + "0505";
106
+        
107
+        // 해당 년도의 대체휴일 목록
108
+        List<String> altHolyday = new ArrayList<String>();
109
+        
110
+        if(getDayOfWeek(dayFirst1) == Calendar.SUNDAY || getDayOfWeek(dayFirst2) == Calendar.SUNDAY || getDayOfWeek(dayFirst3) == Calendar.SUNDAY || isHolidaySolar(dayFirst1) || isHolidaySolar(dayFirst2) || isHolidaySolar(dayFirst3)) {
111
+            int y = Integer.parseInt(dayFirst3.substring(0, 4));
112
+            int m = Integer.parseInt(dayFirst3.substring(4, 6)) - 1;
113
+            int d = Integer.parseInt(dayFirst3.substring(6)) + 1;
114
+            Calendar c = Calendar.getInstance();
115
+            c.set(y, m, d);
116
+            altHolyday.add(sdf.format(c.getTime()));
117
+        }
118
+            
119
+        if(getDayOfWeek(dayThanks1) == Calendar.SUNDAY || getDayOfWeek(dayThanks2) == Calendar.SUNDAY || getDayOfWeek(dayThanks3) == Calendar.SUNDAY || isHolidaySolar(dayThanks1) || isHolidaySolar(dayThanks2) || isHolidaySolar(dayThanks3)) {
120
+            int y = Integer.parseInt(dayThanks3.substring(0, 4));
121
+            int m = Integer.parseInt(dayThanks3.substring(4, 6)) - 1;
122
+            int d = Integer.parseInt(dayThanks3.substring(6)) + 1;
123
+            Calendar c = Calendar.getInstance();
124
+            c.set(y, m, d);
125
+            altHolyday.add(sdf.format(c.getTime()));
126
+        }
127
+        
128
+        int childWeek = getDayOfWeek(dayChild); 
129
+        
130
+        if(childWeek == Calendar.SATURDAY) {
131
+            int y = Integer.parseInt(dayChild.substring(0, 4));
132
+            int m = Integer.parseInt(dayChild.substring(4, 6)) - 1;
133
+            int d = Integer.parseInt(dayChild.substring(6)) + 2;
134
+            Calendar c = Calendar.getInstance();
135
+            c.set(y, m, d);
136
+            altHolyday.add(sdf.format(c.getTime()));
137
+        }
138
+            
139
+        if(childWeek == Calendar.SUNDAY) {
140
+            int y = Integer.parseInt(dayChild.substring(0, 4));
141
+            int m = Integer.parseInt(dayChild.substring(4, 6)) - 1;
142
+            int d = Integer.parseInt(dayChild.substring(6)) + 1;
143
+            Calendar c = Calendar.getInstance();
144
+            c.set(y, m, d);
145
+            altHolyday.add(sdf.format(c.getTime()));
146
+        }
147
+        
148
+        return altHolyday.contains(date); 
149
+        */
150
+    }
151
+    
152
+    public HashMap<String, Tuple2<String, String>> init() {
153
+    	HashMap<String, Tuple2<String, String>> holidays = new HashMap<String, Tuple2<String, String>>();	//	선생님이 추가한 코드
154
+    	SortedSet<String> sortHoliday = new TreeSet<String>();
155
+    	solarHolidayMap.put("0101", "신정"); solarHolidayMap.put("0301", "삼일절");
156
+        solarHolidayMap.put("0505", "어린이날"); solarHolidayMap.put("0606", "현충일");
157
+        solarHolidayMap.put("0815", "광복절"); solarHolidayMap.put("1003", "개천절");
158
+        solarHolidayMap.put("1009", "한글날"); solarHolidayMap.put("1225", "성탄절");
159
+ 
160
+        lunarHolidayMap.put("0101", "구정"); lunarHolidayMap.put("0102", "구정");
161
+        lunarHolidayMap.put("0408", "석가탄신일"); lunarHolidayMap.put("0814", "추석");
162
+        lunarHolidayMap.put("0815", "추석"); lunarHolidayMap.put("0816", "추석");
163
+ 
164
+        int year = 2015;
165
+        int endYear = 2030;
166
+ 
167
+        Calendar c = Calendar.getInstance();
168
+        c.set(year, 0, 1); // 1월 1일부터 시작
169
+        
170
+        String solarDate = "";
171
+        String lunarDate = "";
172
+        for(int i=year; i<=endYear;) {
173
+            solarDate = getDateByString(c.getTime(), "");
174
+            lunarDate = converSolarToLunar(solarDate, "");
175
+ 
176
+            c.add(Calendar.DAY_OF_MONTH, 1);
177
+ 
178
+            //	대체휴일 체크
179
+            if(isHolidayAlternate(solarDate)) {
180
+            	String day = solarDate.substring(0,4) + "-" + solarDate.substring(4,6) + "-" + solarDate.substring(6,8);
181
+                System.out.println(day + " ==> " + "대체공휴일");
182
+                holidays.put(day, new Tuple2<String, String>(day, "대체공휴일"));	//	선생님이 추가한 코드
183
+                sortHoliday.add(day);
184
+            }
185
+            
186
+            // 양력휴일 체크
187
+            String solarMmdd = solarDate.substring(4,8);
188
+            if(solarHolidayMap.containsKey(solarMmdd)) {
189
+            	String day = solarDate.substring(0,4) + "-" + solarDate.substring(4,6) + "-" + solarDate.substring(6,8);
190
+                System.out.println(day + " ==> " + solarHolidayMap.get(solarMmdd));
191
+                holidays.put(day, new Tuple2<String, String>(day, solarHolidayMap.get(solarMmdd)));	//	선생님이 추가한 코드
192
+                sortHoliday.add(day);
193
+            }
194
+ 
195
+            // 음력휴일 체크
196
+            String lunarMmdd = lunarDate.substring(4,8);
197
+            if(lunarHolidayMap.containsKey(lunarMmdd)) {
198
+                // 음력 12월은 마지막날이 29일, 30일 계속 번갈아가면서 있으므로
199
+                // 양력에서 하루를 빼준날이 구정시작하는 날짜이다.
200
+                if(lunarMmdd.equals("0101")) {
201
+                	String tmp = getDay(solarDate, -1);
202
+                	String day = tmp.substring(0,4) + "-" + tmp.substring(4,6) + "-" + tmp.substring(6,8);
203
+                	System.out.println(day + " ==> 구정");
204
+                	holidays.put(day, new Tuple2<String, String>(day, "구정"));	//	선생님이 추가한 코드
205
+                	sortHoliday.add(day);
206
+                }
207
+                String day = solarDate.substring(0,4) + "-" + solarDate.substring(4,6) + "-" + solarDate.substring(6,8);
208
+                System.out.println(day + " ==> " + lunarHolidayMap.get(lunarMmdd));
209
+                holidays.put(day, new Tuple2<String, String>(day, lunarHolidayMap.get(lunarMmdd)));	//	선생님이 추가한 코드
210
+                sortHoliday.add(day);
211
+            }
212
+            
213
+            year = c.get(Calendar.YEAR);
214
+            if(year != i) {
215
+                i++;
216
+                System.out.println("");
217
+            }
218
+            if(i > endYear) break;
219
+        } // end for_i
220
+        
221
+        
222
+        // 	Quiz. 연속된 휴일을 시작과 종료로 연결
223
+        
224
+        //	holidays에 있는 값 출력
225
+        Iterator<String> holidayKeys = holidays.keySet().iterator();
226
+        while(holidayKeys.hasNext()) {
227
+        	String key = holidayKeys.next();
228
+        	Tuple2<String, String> tuple = holidays.get(key);
229
+        	System.out.println(key + " ~ " + tuple._1() + " ==> "  + tuple._2());
230
+        }
231
+        return holidays;
232
+    }
233
+    
234
+    public static ArrayList<String> getCurrentWeek(String day) throws ParseException {
235
+    	ArrayList<String> list = new ArrayList<String>();
236
+		SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
237
+		Date date = format.parse(day);
238
+		
239
+		Calendar calendar = Calendar.getInstance();
240
+		calendar.setTime(date);
241
+		int dayOfWeekDay = calendar.get(Calendar.DAY_OF_WEEK) - calendar.getFirstDayOfWeek();
242
+		calendar.add(Calendar.DAY_OF_MONTH, -dayOfWeekDay);
243
+		
244
+		String[] days = {"일","월","화","수","목","금","토"};
245
+		for (int i = 0; i < days.length; i++) {
246
+			list.add(format.format(calendar.getTime()));
247
+			calendar.add(Calendar.DAY_OF_MONTH, 1);
248
+		}
249
+		return list;
250
+    }
251
+    
252
+    //	종료날짜는 포함하지 않는다.
253
+    public static List<String> getDateListEndDateNotInclude(String startDate, String endDate) throws ParseException{
254
+    	SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
255
+    	Calendar calendar = Calendar.getInstance();
256
+    	calendar.clear();
257
+    	List<String> holidayList = Lists.newArrayList();
258
+    	
259
+    	Date sDate = simpleDateFormat.parse(startDate);
260
+    	Date eDate = simpleDateFormat.parse(endDate);
261
+
262
+    	//	두 날짜의 차이를 계산
263
+    	int count = (int)( (eDate.getTime() - sDate.getTime()) / (1000 * 60 * 60 * 24));
264
+    	calendar.setTime(sDate);
265
+    	
266
+    	//	차이가 0 보다 큰 경우에는 시작날짜 추가
267
+    	if(count >= 0) holidayList.add(startDate);
268
+    	//	시작일과 종료일이 1보다 큰 경우에는 날짜를 모두 등록
269
+    	for(int dayCount = 1; dayCount <= count; dayCount++) {
270
+    		calendar.add(Calendar.DAY_OF_MONTH, 1);
271
+    		Date time = calendar.getTime();
272
+    		String day = simpleDateFormat.format(time);
273
+    		holidayList.add(day);
274
+    	}
275
+    	return holidayList;
276
+    }
277
+    
278
+    public static Tuple2<ArrayList<String>, Map<String, Integer>> getStartDayEndDay(List<String> holidays, String startDay, String endDay, String dayOfWeek) throws ParseException {
279
+    	
280
+		SimpleDateFormat originalFormat = new SimpleDateFormat("yyyy-MM-dd");
281
+		Date startDate = originalFormat.parse(startDay);
282
+		Date endDate = originalFormat.parse(endDay);		//	종료일짜를 출석일 제일 마지막 날짜로 초기화
283
+		Calendar calendar = Calendar.getInstance();
284
+		calendar.clear();
285
+		
286
+		//	종료 날짜는 출석일 제일 마지막 날짜의 다음 달 말일까지 계산
287
+		calendar.setTime(endDate);
288
+		calendar.add(Calendar.MONTH, 1);
289
+		int monthEndDay = calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
290
+		calendar.set(Calendar.DATE, monthEndDay);
291
+		endDate = calendar.getTime();
292
+	
293
+		//	시작 날짜는 수강 시작일
294
+		calendar.setTime(startDate);
295
+//		시작 날짜가 포함된 주의 첫번째 날짜 계산
296
+//		calendar.setTime(startDate);
297
+//		int dayOfWeekDay = calendar.get(Calendar.DAY_OF_WEEK) - calendar.getFirstDayOfWeek();
298
+//		calendar.add(Calendar.DAY_OF_MONTH, -dayOfWeekDay);
299
+		
300
+		
301
+		//	두 날짜의 차이를 계산
302
+		int count = (int)( (endDate.getTime() - calendar.getTime().getTime()) / (1000 * 60 * 60 * 24));
303
+		
304
+		SimpleDateFormat mapFormat = new SimpleDateFormat("yyyy년 MM월");
305
+		Map<String, Integer> monthTotalCountMap = new TreeMap<String, Integer>();
306
+		ArrayList<String> subjectDays = new ArrayList<String>();
307
+		for (int i = 0; i <= count; i++) {
308
+			Date time = calendar.getTime();
309
+			String day = originalFormat.format(time);
310
+			
311
+			//	과정의 수강요일만 포함하며 휴강일은 카운트에서 제외
312
+			if(dayOfWeek != null && dayOfWeek.charAt(calendar.get(Calendar.DAY_OF_WEEK)-1) == '1' &&
313
+					!holidays.contains(day)){
314
+				
315
+				//	해당 월에 대한 총 카운트를 저장
316
+				String monthMap = mapFormat.format(time);
317
+				int monthMapKey = (monthTotalCountMap.containsKey(monthMap)) ? monthTotalCountMap.get(monthMap)+1 : 1;
318
+				monthTotalCountMap.put(monthMap, monthMapKey);
319
+				subjectDays.add(day);
320
+			}
321
+			calendar.add(Calendar.DAY_OF_MONTH, 1);
322
+		}
323
+		
324
+		return new Tuple2<ArrayList<String>, Map<String,Integer>>(subjectDays, monthTotalCountMap);
325
+    }
326
+    
327
+    public static void main(String[] args) {
328
+    	HolidayUtils holiday = new HolidayUtils();
329
+    	holiday.init();
330
+    }
331
+}

+ 237
- 0
src/main/java/kr/co/swh/lecture/opensource/jsoup/news/A_CrawlClient.java Ver fichero

@@ -0,0 +1,237 @@
1
+package kr.co.swh.lecture.opensource.jsoup.news;
2
+
3
+import java.text.SimpleDateFormat;
4
+import java.util.ArrayList;
5
+import java.util.Date;
6
+import java.util.List;
7
+import java.util.concurrent.CountDownLatch;
8
+
9
+import org.apache.commons.lang3.StringUtils;
10
+
11
+import com.fasterxml.jackson.databind.ObjectMapper;
12
+import com.google.common.collect.Lists;
13
+
14
+
15
+public abstract class A_CrawlClient implements ICrawlService{
16
+
17
+	protected transient ObjectMapper mapper;
18
+	protected transient CountDownLatch articleBreakLatch;
19
+	
20
+	protected String domainName, startDate, endDate;
21
+	protected String[] categories;
22
+	protected Integer sleepTime;
23
+	
24
+	protected List<String> cacheNewsId;
25
+	protected final String autoModeTag = "~";
26
+	protected boolean autoMonitor;
27
+	protected boolean continueCrawl;
28
+	protected String monitoringStartDate;					//	모니터링 시작 날짜
29
+	protected String currentStartDate;
30
+	
31
+	//	final value
32
+	protected final int monitoringNewPageBlockNum = 10;		//	모니터링 페이지 임계치
33
+	protected final int fastModeSleepTime = 0;				//	milisecond
34
+	protected final int safeModeSleepTime = 500;			//	milisecond
35
+	
36
+	//	pause
37
+	protected String lastProcessId;
38
+	
39
+	
40
+	public A_CrawlClient(String domain, String startDate, String endDate, String[] category, boolean fastMode)
41
+	{
42
+		this.mapper = new ObjectMapper();
43
+		this.articleBreakLatch = new CountDownLatch(1);
44
+		
45
+		this.domainName = domain.toLowerCase();
46
+		this.startDate = startDate;
47
+		this.endDate = endDate;
48
+		this.categories = category;
49
+		this.currentStartDate = startDate;
50
+		
51
+		this.monitoringStartDate = currentDate();
52
+		this.cacheNewsId = Lists.newArrayList();
53
+		
54
+		if(fastMode) sleepTime = fastModeSleepTime;
55
+		else sleepTime = safeModeSleepTime;
56
+		
57
+		//	현 시스템 날짜와 종료날짜가 동일하다면, 하루전 종료일짜까지 수집한 후, 현 시스템 날짜는 모니터링을 통해 실시간으로 수집, 날짜가 지나면 종료
58
+		//	'~' 표시인 경우 계속 수집
59
+		if(endDate.equals(this.autoModeTag)) {
60
+			this.continueCrawl = true;
61
+		}
62
+
63
+		//	종료날짜가 현 시스템 날짜보다 같거나 크다면, 현 시스템 날짜로 종료날짜를 등록하고 모니터링
64
+		if(this.continueCrawl || compareToDates(monitoringStartDate, endDate) >= 0) {
65
+			this.autoMonitor = true;
66
+		}
67
+	}
68
+	
69
+	
70
+	@Override
71
+	public void crawlling() throws Exception{
72
+		//	최초 스타트 일짜로 수행
73
+		fetch(startDate);
74
+		
75
+		//	장시간 기사를 수집할 시, 패치날짜의 간격이 커짐으로써, 모니터링 대상 날짜가 늘어나는 것을 방지하기 위해 한번 더 패치를 수행
76
+		String oldMonitoringStartDate = monitoringStartDate;
77
+		String currentSystemDate = currentDate();
78
+		if(updateSystemDateMonitoringDate(currentSystemDate)){
79
+			fetch(oldMonitoringStartDate);
80
+		}
81
+		
82
+		//	모니터링 모드일 경우 수행
83
+		if(autoMonitor) monitor();
84
+	}
85
+	
86
+	protected String currentDate(){
87
+		return new SimpleDateFormat("yyyyMMdd").format(new Date(System.currentTimeMillis()));
88
+	}
89
+	
90
+	protected boolean updateSystemDateMonitoringDate(String systemDate){
91
+		//	수집 시작한 날짜와 현 시스템의 날짜가 다른 경우,
92
+		if(!monitoringStartDate.equals(systemDate)) {
93
+			//	시스템 수집날짜를 현 시간으로 설정하여, 모니터링 대상 날짜로 지정
94
+			monitoringStartDate = systemDate;
95
+			return true;
96
+		}
97
+		return false;
98
+	}
99
+	
100
+	protected boolean isContinueCrawl(){
101
+		if(continueCrawl) return true;
102
+		else{
103
+			//	모니터링 날짜가 종료날짜보다 커지는 경우는 돌면 안됨. 
104
+			if(compareToDates(endDate, monitoringStartDate) > 0) return false;
105
+			else return true;
106
+		}
107
+	}
108
+	
109
+	protected List<String> getFetchDateList(String sourceDate){
110
+		List<String> fetchDateList = null;
111
+		
112
+		//	패치대상 날짜들은 현 시스템 날짜보다 하루 전 까지만 수행(모니터링 대상 날짜로부터)
113
+		if(autoMonitor) {
114
+			fetchDateList = getStringFormatDateList(sourceDate, monitoringStartDate);
115
+			fetchDateList.remove(fetchDateList.size()-1);
116
+		}
117
+		//	모니터링이 필요없는 경우
118
+		else fetchDateList = getStringFormatDateList(sourceDate, endDate);
119
+		return fetchDateList;
120
+	}
121
+	
122
+	protected List<String> getStringFormatDateList(String sourceDate, String targetDate){
123
+		List<String> fetchDateList = new ArrayList<String>();
124
+		int startYear = Integer.parseInt(sourceDate.substring(0, 4));
125
+		int startMonth = Integer.parseInt(sourceDate.substring(4, 6));
126
+		int startDay = Integer.parseInt(sourceDate.substring(6, 8));
127
+		int endYear = Integer.parseInt(targetDate.substring(0, 4));
128
+		int endMonth = Integer.parseInt(targetDate.substring(4, 6));
129
+		int endDay = Integer.parseInt(targetDate.substring(6, 8));
130
+		
131
+		int startTotalDay = totalDayFromCalendar(startYear, startMonth, startDay);
132
+		int endTotalDay = totalDayFromCalendar(endYear, endMonth, endDay);
133
+		
134
+		for(; startTotalDay <= endTotalDay; startTotalDay++){
135
+			int startEndDay = endDayFromTotalDay(startYear, startMonth);
136
+			if(startDay > startEndDay) {
137
+				startMonth++;
138
+				startDay = 1;
139
+			}
140
+			if(startMonth > 12){
141
+				startYear++;
142
+				startMonth = 1;
143
+				startDay = 1;
144
+			}
145
+			String fetchDate = startYear + StringUtils.leftPad(String.valueOf(startMonth), 2, "0") + StringUtils.leftPad(String.valueOf(startDay), 2, "0");
146
+			fetchDateList.add(fetchDate);
147
+			startDay++;
148
+		}
149
+		return fetchDateList;
150
+	}
151
+	
152
+	private int compareToDates(String sourceDate, String targetDate){
153
+		int startYear = Integer.parseInt(sourceDate.substring(0, 4));
154
+		int startMonth = Integer.parseInt(sourceDate.substring(4, 6));
155
+		int startDay = Integer.parseInt(sourceDate.substring(6, 8));
156
+		int endYear = Integer.parseInt(targetDate.substring(0, 4));
157
+		int endMonth = Integer.parseInt(targetDate.substring(4, 6));
158
+		int endDay = Integer.parseInt(targetDate.substring(6, 8));
159
+		
160
+		int startTotalDay = totalDayFromCalendar(startYear, startMonth, startDay);
161
+		int endTotalDay = totalDayFromCalendar(endYear, endMonth, endDay);
162
+		
163
+		return endTotalDay - startTotalDay;
164
+	}
165
+	
166
+	public static int totalDayFromCalendar(int year, int month, int day){
167
+		int totaldays;
168
+		totaldays = 365 * (year - 1);
169
+		for (int i = 1; i < year; i++) {
170
+			if (i % 4 == 0 && i % 100 != 0 || i % 400 == 0)
171
+				totaldays++;
172
+		}
173
+		// totaldays = 365 * (year-1) + (year-1)/4 - (year-1)/100 + (year-1)/400
174
+		int premonth = month - 1;
175
+		if (premonth >= 1)
176
+			totaldays += 31;
177
+		if (premonth >= 2)
178
+			totaldays += 28;
179
+		if (premonth >= 3)
180
+			totaldays += 31;
181
+		if (premonth >= 4)
182
+			totaldays += 30;
183
+		if (premonth >= 5)
184
+			totaldays += 31;
185
+		if (premonth >= 6)
186
+			totaldays += 30;
187
+		if (premonth >= 7)
188
+			totaldays += 31;
189
+		if (premonth >= 8)
190
+			totaldays += 31;
191
+		if (premonth >= 9)
192
+			totaldays += 30;
193
+		if (premonth >= 10)
194
+			totaldays += 31;
195
+		if (premonth >= 11)
196
+			totaldays += 30;
197
+		if (month > 2 && (year % 4 == 0 && year % 100 != 0 || year % 400 == 0))
198
+			totaldays++;
199
+		totaldays++;
200
+
201
+//		int day = totaldays % 7;
202
+		
203
+		totaldays = totaldays + day;
204
+		return totaldays;
205
+	}
206
+	
207
+	public static int endDayFromTotalDay(int year, int month){
208
+		int lastday;
209
+		switch (month) {
210
+		case 2:
211
+			lastday = 28;
212
+			if (year % 4 == 0 && year % 100 != 0 || year % 400 == 0)
213
+				lastday = 29;
214
+			else
215
+				lastday = 28;
216
+			break;
217
+		case 4:
218
+		case 6:
219
+		case 9:
220
+		case 11:
221
+			lastday = 30;
222
+			break;
223
+		default:
224
+			lastday = 31;
225
+		}
226
+		return lastday;
227
+	}
228
+	
229
+	
230
+	public synchronized void shutDown(){
231
+		this.articleBreakLatch.countDown();
232
+	}
233
+	
234
+	
235
+	public abstract void fetch(String sourceDate) throws Exception;
236
+	public abstract void monitor();
237
+}

+ 66
- 0
src/main/java/kr/co/swh/lecture/opensource/jsoup/news/ArticleCrawlerModel.java Ver fichero

@@ -0,0 +1,66 @@
1
+package kr.co.swh.lecture.opensource.jsoup.news;
2
+
3
+import java.io.Serializable;
4
+
5
/**
 * Serializable value holder for one crawled article: id, category, URL,
 * title, registration date and body text. All properties are plain strings
 * and default to {@code null}.
 */
public class ArticleCrawlerModel implements Serializable {

    private static final long serialVersionUID = 1L;

    private String id;
    private String category;
    private String url;
    private String title;
    private String regDate;
    private String contents;

    /** @return the article id */
    public String getId() {
        return this.id;
    }

    /** @param id the article id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the category */
    public String getCategory() {
        return this.category;
    }

    /** @param category the category */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @return the article URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the article URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the registration date */
    public String getRegDate() {
        return this.regDate;
    }

    /** @param regDate the registration date */
    public void setRegDate(String regDate) {
        this.regDate = regDate;
    }

    /** @return the article body */
    public String getContents() {
        return this.contents;
    }

    /** @param contents the article body */
    public void setContents(String contents) {
        this.contents = contents;
    }

    /** Renders every property in the form {@code News{regDate=..., ...}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News{");
        text.append("regDate=").append(regDate);
        text.append(", category=").append(category);
        text.append(", id=").append(id);
        text.append(", url=").append(url);
        text.append(", title=").append(title);
        text.append(", contents=").append(contents);
        text.append('}');
        return text.toString();
    }
}

+ 8
- 0
src/main/java/kr/co/swh/lecture/opensource/jsoup/news/ICrawlService.java Ver fichero

@@ -0,0 +1,8 @@
1
+package kr.co.swh.lecture.opensource.jsoup.news;
2
+
3
/**
 * Contract for a crawler: a top-level crawl entry point, a date-based fetch
 * step, a monitoring step and a shutdown hook.
 */
public interface ICrawlService {

    /**
     * Runs the crawl.
     *
     * @throws Exception if crawling fails
     */
    void crawlling() throws Exception;

    /**
     * Fetches content starting from the given date.
     *
     * @param sourceDate the date to fetch from
     * @throws Exception if fetching fails
     */
    void fetch(String sourceDate) throws Exception;

    /** Runs the monitoring step. */
    void monitor();

    /** Asks the crawler to stop. */
    void shutDown();
}

+ 288
- 0
src/main/java/kr/co/swh/lecture/opensource/jsoup/news/NaverNewsCrawlSource.java Ver fichero

@@ -0,0 +1,288 @@
1
+package kr.co.swh.lecture.opensource.jsoup.news;
2
+
3
+import java.io.IOException;
4
+import java.util.HashMap;
5
+import java.util.Iterator;
6
+import java.util.LinkedList;
7
+import java.util.List;
8
+import java.util.Map;
9
+import java.util.regex.Matcher;
10
+import java.util.regex.Pattern;
11
+
12
+import org.jsoup.Jsoup;
13
+import org.jsoup.nodes.Document;
14
+import org.jsoup.nodes.Element;
15
+import org.jsoup.select.Elements;
16
+
17
+import com.google.common.collect.Lists;
18
+
19
+import scala.Tuple2;
20
+
21
+
22
+public class NaverNewsCrawlSource 
23
+{
24
+	private String domain, startDate, endDate;
25
+	private String[] category;
26
+	private Boolean fastMode;
27
+	
28
+	private ICrawlService service;
29
+	private final int maxContentsLength = 300000;
30
+	public static LinkedList<String> execTailQueue = new LinkedList<String>();
31
+	
32
+	//	HTML TAG
33
+	public final static String TAG_SCRIPT = "script";
34
+	public final static String META_KEY_CONTENT = "content";
35
+	public final static String TAG_A = "a";
36
+	public final static String TAG_IMG = "img";
37
+	public final static String TAG_HREF = "href";
38
+	public final static String TAG_HTTP = "http";
39
+	
40
+	//	REGX
41
+	public final static String REGX_META_CONTENTS_TITLE = "meta[property=og:title]";
42
+	public final static String REGX_META_CONTENTS_REGDATE = "meta[property=og:regDate]";
43
+	public final static String REGX_META_CONTENTS_PUBDATE = "meta[property=article:published_time]";
44
+	
45
+	//////////////////////////////////////////////////////
46
+	//	NAVER
47
+	//	CATEGORY
48
+	//	정치, 경제, 사회, 문화, 세계, 과학, IT  (except. 연예, 스포츠, 기후)
49
+	public final static String[] NAVER_CATEGORY = {"politics", "economy", "society", "culture", "world", "science", "info-tech", "sports", "entertainment", "weather"};
50
+	public final static Map<String, Tuple2<String, String>> NAVER_CATEGORY_MAP = new HashMap<String, Tuple2<String, String>>();
51
+	static{
52
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[0], new Tuple2<String, String>("sid1=100", "sid2=269"));
53
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[1], new Tuple2<String, String>("sid1=101", "sid2=263"));
54
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[2], new Tuple2<String, String>("sid1=102", "sid2=257"));
55
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[3], new Tuple2<String, String>("sid1=103", "sid2=245"));
56
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[4], new Tuple2<String, String>("sid1=104", "sid2=322"));
57
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[5], new Tuple2<String, String>("sid1=105", "sid2=228"));
58
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[6], new Tuple2<String, String>("sid1=105", "sid2=230"));
59
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[7], new Tuple2<String, String>("sid1=100", "sid2=269"));
60
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[8], new Tuple2<String, String>("sid1=100", "sid2=269"));
61
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[9], new Tuple2<String, String>("sid1=100", "sid2=269"));
62
+	}
63
+	//	URL
64
+	public final static String NAVER_URL_HEAD = "http://news.naver.com";
65
+	public final static String NAVER_FETCH_URL_HEAD = NAVER_URL_HEAD + "/main/list.nhn?%s&%s&mid=shm&mode=LS2D&date=%s&page=%s";	//	sid1, sid2, date, paging
66
+	
67
+	//	NAVER REGX
68
+	public final static String NAVER_REGX_FETCH_SUBURL = "a[href]";
69
+	public final static String NAVER_REGX_FETCH_NEWS_ID = "(?<=(&oid=|\\?oid=)|(&aid=|\\?aid=))[\\d]+";
70
+	
71
+	public final static String NAVER_REGX_CONTENTS_REGDATE_GET = ".sponsor > span";
72
+	public final static String NAVER_REGX_CONTENTS_GET = "#articleBodyContents";
73
+	
74
+	public void init() {
75
+		domain = "naver";
76
+		category = new String[]{"politics"};
77
+		startDate = "20191019";
78
+		endDate = "20191020";
79
+		fastMode = false;
80
+		service = new NaverCrawlManager(domain, startDate, endDate, category, fastMode);
81
+		
82
+		try {
83
+			service.crawlling();
84
+		} catch (Exception e) {
85
+			// TODO Auto-generated catch block
86
+			e.printStackTrace();
87
+		}
88
+	}
89
+
90
+	public static void main(String[] args) {
91
+		
92
+		NaverNewsCrawlSource source = new NaverNewsCrawlSource();
93
+		source.init();
94
+	}
95
+	
96
+	class NaverCrawlManager extends A_CrawlClient{
97
+		
98
+		public NaverCrawlManager(String domain, String startDate, String endDate, String[] category, boolean fastMode)
99
+		{
100
+			super(domain, startDate, endDate, category, fastMode);
101
+		}
102
+		
103
+		@Override
104
+		public void fetch(String sourceDate) throws Exception{
105
+			List<String> fetchDateList = getFetchDateList(sourceDate);
106
+			rootBreak:
107
+				for(String crawlDate : fetchDateList){
108
+					currentStartDate = crawlDate;
109
+					for(String category : categories){
110
+						for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
111
+							//	shutdown
112
+							if(articleBreakLatch.getCount() < 1) break rootBreak;
113
+							try {
114
+								
115
+								List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
116
+								int cnt = findNewsDocList.size();
117
+								if(cnt <= 0) {
118
+									break;
119
+								}
120
+								contentsCrawlling(findNewsDocList, category);
121
+								
122
+								Thread.sleep(sleepTime);
123
+							} catch (IOException | InterruptedException e) {
124
+								// TODO Auto-generated catch block
125
+								throw e;
126
+							}
127
+						}
128
+					}
129
+					cacheNewsId.clear();
130
+				}
131
+		}
132
+		
133
+		@Override
134
+		public void monitor(){
135
+			new Thread(new Runnable(){
136
+				@Override
137
+				public void run() {
138
+					// TODO Auto-generated method stub
139
+					while(true){
140
+						String currentSystemDate = currentDate();
141
+						
142
+						List<String> monitorArticleList = getStringFormatDateList(monitoringStartDate, currentSystemDate);
143
+						for(String crawlDate : monitorArticleList){
144
+							currentStartDate = crawlDate;
145
+							for(String category : categories){
146
+								int checkPageBlockNum = monitoringNewPageBlockNum;
147
+								for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
148
+									try {
149
+										List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
150
+										if(findNewsDocList.size() <= 0) {
151
+											checkPageBlockNum--;
152
+										}
153
+										else checkPageBlockNum = monitoringNewPageBlockNum;
154
+										
155
+										if(checkPageBlockNum <= 0) {
156
+											checkPageBlockNum = monitoringNewPageBlockNum;
157
+											break;
158
+										}
159
+										contentsCrawlling(findNewsDocList, category);
160
+										
161
+										Thread.sleep(sleepTime);
162
+									} catch (IOException | InterruptedException e) {
163
+										// TODO Auto-generated catch block
164
+										articleBreakLatch.countDown();
165
+									}
166
+								}
167
+							}
168
+						}
169
+						if(updateSystemDateMonitoringDate(currentSystemDate)) cacheNewsId.clear();
170
+						if(!isContinueCrawl()) {
171
+							articleBreakLatch.countDown();
172
+						}
173
+					}
174
+				}
175
+			}).start();
176
+			
177
+			try {
178
+				articleBreakLatch.await();
179
+			} catch (InterruptedException e) {
180
+				// TODO Auto-generated catch block
181
+			} finally {
182
+			}
183
+		}
184
+		
185
+		private List<String> getArticleURLHeader(int pageIndex, String category, String crawlDate) throws IOException{
186
+			List<String> findNewsDocList = Lists.newArrayList();
187
+			String page = String.valueOf(pageIndex);
188
+			Tuple2<String, String> seedKey = NAVER_CATEGORY_MAP.get(category);
189
+			String fetchURL = String.format(NAVER_FETCH_URL_HEAD, seedKey._1(), seedKey._2(), crawlDate, page);
190
+			Document fetchData = Jsoup.connect(fetchURL).timeout(0).get();
191
+			Elements fetchTmpDatas = fetchData.select(NAVER_REGX_FETCH_SUBURL);
192
+			for(Element doc : fetchTmpDatas){
193
+				if(!doc.select(TAG_A).is(TAG_IMG) && 
194
+						doc.attr(TAG_HREF).startsWith(TAG_HTTP) &&
195
+						doc.attr(TAG_HREF).contains(seedKey._1()) &&
196
+						doc.attr(TAG_HREF).contains(seedKey._2())){
197
+					
198
+					//	네이버에서는 페이지 카운트에 대한 종료를 위해서는 페이지의 주소 페이지 주소로 종료
199
+					if(cacheNewsId.contains(doc.attr(TAG_HREF))) continue;
200
+					cacheNewsId.add(doc.attr(TAG_HREF));
201
+					findNewsDocList.add(doc.attr(TAG_HREF));
202
+				}
203
+			}
204
+			return findNewsDocList;
205
+		}
206
+		
207
+		//	CONTENTS GET
208
+		private void contentsCrawlling(List<String> findNewsDocList, String category) throws IOException{
209
+			int retry = 3;
210
+			for(int index = 0; index < findNewsDocList.size(); index++){
211
+				String newsDocUrl = findNewsDocList.get(index);
212
+				Document crawlDoc;
213
+				try {
214
+					crawlDoc = Jsoup.connect(newsDocUrl).timeout(0).get();
215
+				} catch (IOException e) {
216
+					// TODO Auto-generated catch block
217
+					if(retry < 0) throw e;
218
+					index--;
219
+					continue;
220
+				}
221
+
222
+				//	newsId
223
+				String newsId = "";
224
+				Pattern pattern = Pattern.compile(NAVER_REGX_FETCH_NEWS_ID, Pattern.CASE_INSENSITIVE);
225
+				Matcher m = pattern.matcher(crawlDoc.baseUri());
226
+				int findIndex = 0;
227
+				while(m.find(findIndex)){
228
+					newsId += crawlDoc.baseUri().substring(m.start(), m.end());
229
+					newsId += "-";
230
+					findIndex = m.end();
231
+				}
232
+				newsId = newsId.substring(0, newsId.length()-1);
233
+				
234
+				//	title
235
+				String title = "";
236
+				Elements metaOgTitle = crawlDoc.select(REGX_META_CONTENTS_TITLE);
237
+				if (metaOgTitle!=null) {
238
+					title = metaOgTitle.attr(META_KEY_CONTENT);
239
+				}
240
+				
241
+				//	regDate
242
+				String regDate = "";
243
+				Elements metaOgRegDate = crawlDoc.select(NAVER_REGX_CONTENTS_REGDATE_GET);
244
+				if (metaOgRegDate != null) {
245
+					regDate = metaOgRegDate.text();
246
+				}
247
+
248
+				StringBuilder contentsBuilder = new StringBuilder();
249
+				Elements articleBodyContentsRows = crawlDoc.select(NAVER_REGX_CONTENTS_GET);
250
+				String contents;
251
+				int docContentsLength = 0;
252
+				boolean contentsStatus = false;
253
+				loop:
254
+				for (Element articleBodyContentsRow : articleBodyContentsRows) {
255
+					Iterator<Element> iterElem = articleBodyContentsRow.getAllElements().iterator();
256
+					while(iterElem.hasNext()){
257
+						String text = iterElem.next().text();
258
+						docContentsLength += text.length();
259
+						contentsBuilder.append(text);
260
+						if(docContentsLength > maxContentsLength){
261
+							contentsStatus = true;
262
+							break loop;
263
+						}
264
+					}
265
+				}
266
+				if(contentsStatus) contents = contentsBuilder.toString().substring(0, maxContentsLength);
267
+				else contents = contentsBuilder.toString();
268
+				
269
+
270
+				ArticleCrawlerModel article = new ArticleCrawlerModel();
271
+				article.setId(newsId);
272
+				article.setCategory(category);
273
+				article.setUrl(crawlDoc.baseUri());
274
+				article.setTitle(title);
275
+				article.setRegDate(regDate);
276
+				article.setContents(contents);
277
+				System.out.println(article);
278
+			}
279
+		}
280
+		
281
+		@Override
282
+		public void shutDown(){
283
+			super.shutDown();
284
+		}
285
+		
286
+	}
287
+}
288
+

+ 2
- 0
src/main/resources/application.properties Ver fichero

@@ -0,0 +1,2 @@
1
+demo.value=test
2
+demo.type=dev