|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+package kr.co.swh.lecture.opensource.project.news;
|
|
|
2
|
+
|
|
|
3
|
+import java.io.IOException;
|
|
|
4
|
+import java.sql.Connection;
|
|
|
5
|
+import java.sql.DriverManager;
|
|
|
6
|
+import java.sql.PreparedStatement;
|
|
|
7
|
+import java.sql.SQLException;
|
|
|
8
|
+import java.util.HashMap;
|
|
|
9
|
+import java.util.Iterator;
|
|
|
10
|
+import java.util.LinkedList;
|
|
|
11
|
+import java.util.List;
|
|
|
12
|
+import java.util.Map;
|
|
|
13
|
+import java.util.regex.Matcher;
|
|
|
14
|
+import java.util.regex.Pattern;
|
|
|
15
|
+
|
|
|
16
|
+import org.jsoup.Jsoup;
|
|
|
17
|
+import org.jsoup.nodes.Document;
|
|
|
18
|
+import org.jsoup.nodes.Element;
|
|
|
19
|
+import org.jsoup.select.Elements;
|
|
|
20
|
+
|
|
|
21
|
+import com.google.common.collect.Lists;
|
|
|
22
|
+
|
|
|
23
|
+import scala.Tuple2;
|
|
|
24
|
+
|
|
|
25
|
+
|
|
|
26
|
+public class NaverNewsCrawlSourceDB
|
|
|
27
|
+{
|
|
|
28
|
// ---- Crawl settings; assigned in init() ----
private String domain;
private String startDate;
private String endDate;
private String[] category;
private Boolean fastMode;

private ICrawlService service;

// Upper bound, in characters, on the article text kept per document.
private final int maxContentsLength = 300000;

// NOTE(review): not referenced anywhere else in this class - confirm it is still needed.
public static LinkedList<String> execTailQueue = new LinkedList<>();

// ---- HTML tag / attribute names ----
public static final String TAG_SCRIPT = "script";
public static final String META_KEY_CONTENT = "content";
public static final String TAG_A = "a";
public static final String TAG_IMG = "img";
public static final String TAG_HREF = "href";
public static final String TAG_HTTP = "http";

// ---- Selectors for article meta tags ----
public static final String REGX_META_CONTENTS_TITLE = "meta[property=og:title]";
public static final String REGX_META_CONTENTS_REGDATE = "meta[property=og:regDate]";
public static final String REGX_META_CONTENTS_PUBDATE = "meta[property=article:published_time]";

//////////////////////////////////////////////////////
// NAVER
// Categories: politics, economy, society, culture, world, science, IT
// (the original note excludes entertainment, sports and weather).
public static final String[] NAVER_CATEGORY = {
    "politics", "economy", "society", "culture", "world",
    "science", "info-tech", "sports", "entertainment", "weather"
};

// Category name -> (sid1 query fragment, sid2 query fragment).
public static final Map<String, Tuple2<String, String>> NAVER_CATEGORY_MAP = new HashMap<>();

static {
    // Rows are index-aligned with NAVER_CATEGORY.
    // NOTE(review): the last three rows (sports, entertainment, weather) repeat the
    // politics ids - they look like placeholders; confirm before relying on them.
    final String[][] sectionIds = {
        {"sid1=100", "sid2=269"},
        {"sid1=101", "sid2=263"},
        {"sid1=102", "sid2=257"},
        {"sid1=103", "sid2=245"},
        {"sid1=104", "sid2=322"},
        {"sid1=105", "sid2=228"},
        {"sid1=105", "sid2=230"},
        {"sid1=100", "sid2=269"},
        {"sid1=100", "sid2=269"},
        {"sid1=100", "sid2=269"},
    };
    for (int i = 0; i < sectionIds.length; i++) {
        NAVER_CATEGORY_MAP.put(
                NAVER_CATEGORY[i],
                new Tuple2<String, String>(sectionIds[i][0], sectionIds[i][1]));
    }
}

// ---- URLs ----
public static final String NAVER_URL_HEAD = "http://news.naver.com";
// Format arguments, in order: sid1, sid2, date, page.
public static final String NAVER_FETCH_URL_HEAD =
        NAVER_URL_HEAD + "/main/list.nhn?%s&%s&mid=shm&mode=LS2D&date=%s&page=%s";

// ---- Naver-specific selectors / patterns ----
public static final String NAVER_REGX_FETCH_SUBURL = "a[href]";
// Matches the digit run that follows an "oid=" or "aid=" query parameter.
public static final String NAVER_REGX_FETCH_NEWS_ID = "(?<=(&oid=|\\?oid=)|(&aid=|\\?aid=))[\\d]+";

public static final String NAVER_REGX_CONTENTS_REGDATE_GET = ".sponsor > span";
public static final String NAVER_REGX_CONTENTS_GET = "#articleBodyContents";

// ---- JDBC state (package-private) and connection settings ----
Connection connection;
PreparedStatement statement;
static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
static final String DB_URL = "jdbc:mysql://dev-swh.ga:3306/market";

// NOTE(review): credentials are hard-coded in source; consider loading them from configuration.
static final String USERNAME = "root";
static final String PASSWORD = "swhacademy!";
|
|
|
84
|
+
|
|
|
85
|
+ public void init() {
|
|
|
86
|
+ domain = "naver";
|
|
|
87
|
+ category = new String[]{"politics", "economy", "society", "culture", "world", "science", "info-tech", "sports", "entertainment", "weather"};
|
|
|
88
|
+ startDate = "20190101";
|
|
|
89
|
+ endDate = "20190131";
|
|
|
90
|
+ fastMode = true;
|
|
|
91
|
+ service = new NaverCrawlManager(domain, startDate, endDate, category, fastMode);
|
|
|
92
|
+
|
|
|
93
|
+ try {
|
|
|
94
|
+ Class.forName(JDBC_DRIVER);
|
|
|
95
|
+ connection = DriverManager.getConnection(DB_URL,USERNAME,PASSWORD);
|
|
|
96
|
+ System.out.println("MariaDB 연결.");
|
|
|
97
|
+ service.crawlling();
|
|
|
98
|
+ } catch (Exception e) {
|
|
|
99
|
+ // TODO Auto-generated catch block
|
|
|
100
|
+ e.printStackTrace();
|
|
|
101
|
+ }
|
|
|
102
|
+ }
|
|
|
103
|
+
|
|
|
104
|
+ public static void main(String[] args) {
|
|
|
105
|
+
|
|
|
106
|
+ NaverNewsCrawlSourceDB source = new NaverNewsCrawlSourceDB();
|
|
|
107
|
+ source.init();
|
|
|
108
|
+// System.out.println(NaverCrawlManager.totalDayFromCalendar(2019,10,20));
|
|
|
109
|
+// System.out.println(NaverCrawlManager.endDayFromTotalDay(2019,10));
|
|
|
110
|
+// System.out.println(StringUtils.leftPad(String.valueOf(1), 2, "0"));
|
|
|
111
|
+// System.out.println(StringUtils.leftPad(String.valueOf(9), 5, "4"));
|
|
|
112
|
+ }
|
|
|
113
|
+
|
|
|
114
|
+ class NaverCrawlManager extends A_CrawlClient{
|
|
|
115
|
+
|
|
|
116
|
/**
 * Creates a Naver crawl manager; every argument is forwarded unchanged to
 * the {@code A_CrawlClient} constructor.
 *
 * @param domain    crawl target name
 * @param startDate first date to crawl
 * @param endDate   last date to crawl
 * @param category  category names to crawl
 * @param fastMode  fast-mode flag passed to the base client
 */
public NaverCrawlManager(
        String domain,
        String startDate,
        String endDate,
        String[] category,
        boolean fastMode) {
    super(domain, startDate, endDate, category, fastMode);
}
|
|
|
120
|
+
|
|
|
121
|
+ @Override
|
|
|
122
|
+ public void fetch(String sourceDate) throws Exception{
|
|
|
123
|
+ List<String> fetchDateList = getFetchDateList(sourceDate);
|
|
|
124
|
+ rootBreak:
|
|
|
125
|
+ for(String crawlDate : fetchDateList){
|
|
|
126
|
+ currentStartDate = crawlDate;
|
|
|
127
|
+ for(String category : categories){
|
|
|
128
|
+ System.out.println(currentStartDate + " : " + category);
|
|
|
129
|
+ for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
|
|
|
130
|
+ // shutdown
|
|
|
131
|
+ if(articleBreakLatch.getCount() < 1) break rootBreak;
|
|
|
132
|
+ try {
|
|
|
133
|
+
|
|
|
134
|
+ List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
|
|
|
135
|
+ int cnt = findNewsDocList.size();
|
|
|
136
|
+ if(cnt <= 0) {
|
|
|
137
|
+ break;
|
|
|
138
|
+ }
|
|
|
139
|
+ contentsCrawlling(findNewsDocList, category);
|
|
|
140
|
+
|
|
|
141
|
+ Thread.sleep(sleepTime);
|
|
|
142
|
+ } catch (IOException | InterruptedException e) {
|
|
|
143
|
+ // TODO Auto-generated catch block
|
|
|
144
|
+ throw e;
|
|
|
145
|
+ }
|
|
|
146
|
+ }
|
|
|
147
|
+
|
|
|
148
|
+ }
|
|
|
149
|
+ cacheNewsId.clear();
|
|
|
150
|
+ }
|
|
|
151
|
+ }
|
|
|
152
|
+
|
|
|
153
|
+ @Override
|
|
|
154
|
+ public void monitor(){
|
|
|
155
|
+ new Thread(new Runnable(){
|
|
|
156
|
+ @Override
|
|
|
157
|
+ public void run() {
|
|
|
158
|
+ // TODO Auto-generated method stub
|
|
|
159
|
+ while(true){
|
|
|
160
|
+ String currentSystemDate = currentDate();
|
|
|
161
|
+
|
|
|
162
|
+ List<String> monitorArticleList = getStringFormatDateList(monitoringStartDate, currentSystemDate);
|
|
|
163
|
+ for(String crawlDate : monitorArticleList){
|
|
|
164
|
+ currentStartDate = crawlDate;
|
|
|
165
|
+ for(String category : categories){
|
|
|
166
|
+ int checkPageBlockNum = monitoringNewPageBlockNum;
|
|
|
167
|
+ for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
|
|
|
168
|
+ try {
|
|
|
169
|
+ List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
|
|
|
170
|
+ if(findNewsDocList.size() <= 0) {
|
|
|
171
|
+ checkPageBlockNum--;
|
|
|
172
|
+ }
|
|
|
173
|
+ else checkPageBlockNum = monitoringNewPageBlockNum;
|
|
|
174
|
+
|
|
|
175
|
+ if(checkPageBlockNum <= 0) {
|
|
|
176
|
+ checkPageBlockNum = monitoringNewPageBlockNum;
|
|
|
177
|
+ break;
|
|
|
178
|
+ }
|
|
|
179
|
+ contentsCrawlling(findNewsDocList, category);
|
|
|
180
|
+
|
|
|
181
|
+ Thread.sleep(sleepTime);
|
|
|
182
|
+ } catch (IOException | InterruptedException e) {
|
|
|
183
|
+ // TODO Auto-generated catch block
|
|
|
184
|
+ articleBreakLatch.countDown();
|
|
|
185
|
+ }
|
|
|
186
|
+ }
|
|
|
187
|
+ }
|
|
|
188
|
+ }
|
|
|
189
|
+ if(updateSystemDateMonitoringDate(currentSystemDate)) cacheNewsId.clear();
|
|
|
190
|
+ if(!isContinueCrawl()) {
|
|
|
191
|
+ articleBreakLatch.countDown();
|
|
|
192
|
+ }
|
|
|
193
|
+ }
|
|
|
194
|
+ }
|
|
|
195
|
+ }).start();
|
|
|
196
|
+
|
|
|
197
|
+ try {
|
|
|
198
|
+ articleBreakLatch.await();
|
|
|
199
|
+ } catch (InterruptedException e) {
|
|
|
200
|
+ // TODO Auto-generated catch block
|
|
|
201
|
+ } finally {
|
|
|
202
|
+ }
|
|
|
203
|
+ }
|
|
|
204
|
+
|
|
|
205
|
+ private List<String> getArticleURLHeader(int pageIndex, String category, String crawlDate) throws IOException{
|
|
|
206
|
+ List<String> findNewsDocList = Lists.newArrayList();
|
|
|
207
|
+ String page = String.valueOf(pageIndex);
|
|
|
208
|
+ Tuple2<String, String> seedKey = NAVER_CATEGORY_MAP.get(category);
|
|
|
209
|
+ String fetchURL = String.format(NAVER_FETCH_URL_HEAD, seedKey._1(), seedKey._2(), crawlDate, page);
|
|
|
210
|
+ Document fetchData = Jsoup.connect(fetchURL).timeout(0).get();
|
|
|
211
|
+ Elements fetchTmpDatas = fetchData.select(NAVER_REGX_FETCH_SUBURL);
|
|
|
212
|
+ for(Element doc : fetchTmpDatas){
|
|
|
213
|
+ if(!doc.select(TAG_A).is(TAG_IMG) &&
|
|
|
214
|
+ doc.attr(TAG_HREF).startsWith(TAG_HTTP) &&
|
|
|
215
|
+ doc.attr(TAG_HREF).contains(seedKey._1()) &&
|
|
|
216
|
+ doc.attr(TAG_HREF).contains(seedKey._2())){
|
|
|
217
|
+
|
|
|
218
|
+ // 네이버에서는 페이지 카운트에 대한 종료를 위해서는 페이지의 주소 페이지 주소로 종료
|
|
|
219
|
+ if(cacheNewsId.contains(doc.attr(TAG_HREF))) continue;
|
|
|
220
|
+ cacheNewsId.add(doc.attr(TAG_HREF));
|
|
|
221
|
+ findNewsDocList.add(doc.attr(TAG_HREF));
|
|
|
222
|
+ }
|
|
|
223
|
+ }
|
|
|
224
|
+ return findNewsDocList;
|
|
|
225
|
+ }
|
|
|
226
|
+
|
|
|
227
|
+ // CONTENTS GET
|
|
|
228
|
+ private void contentsCrawlling(List<String> findNewsDocList, String category) throws IOException{
|
|
|
229
|
+ int retry = 3;
|
|
|
230
|
+ for(int index = 0; index < findNewsDocList.size(); index++){
|
|
|
231
|
+ String newsDocUrl = findNewsDocList.get(index);
|
|
|
232
|
+ Document crawlDoc;
|
|
|
233
|
+ try {
|
|
|
234
|
+ crawlDoc = Jsoup.connect(newsDocUrl).timeout(0).get();
|
|
|
235
|
+ } catch (IOException e) {
|
|
|
236
|
+ // TODO Auto-generated catch block
|
|
|
237
|
+ if(retry < 0) throw e;
|
|
|
238
|
+ index--;
|
|
|
239
|
+ continue;
|
|
|
240
|
+ }
|
|
|
241
|
+
|
|
|
242
|
+ // newsId
|
|
|
243
|
+ String newsId = "";
|
|
|
244
|
+ Pattern pattern = Pattern.compile(NAVER_REGX_FETCH_NEWS_ID, Pattern.CASE_INSENSITIVE);
|
|
|
245
|
+ Matcher m = pattern.matcher(crawlDoc.baseUri());
|
|
|
246
|
+ int findIndex = 0;
|
|
|
247
|
+ while(m.find(findIndex)){
|
|
|
248
|
+ newsId += crawlDoc.baseUri().substring(m.start(), m.end());
|
|
|
249
|
+ newsId += "-";
|
|
|
250
|
+ findIndex = m.end();
|
|
|
251
|
+ }
|
|
|
252
|
+ newsId = newsId.substring(0, newsId.length()-1);
|
|
|
253
|
+
|
|
|
254
|
+ // title
|
|
|
255
|
+ String title = "";
|
|
|
256
|
+ Elements metaOgTitle = crawlDoc.select(REGX_META_CONTENTS_TITLE);
|
|
|
257
|
+ if (metaOgTitle!=null) {
|
|
|
258
|
+ title = metaOgTitle.attr(META_KEY_CONTENT);
|
|
|
259
|
+ }
|
|
|
260
|
+
|
|
|
261
|
+ // regDate
|
|
|
262
|
+ String regDate = "";
|
|
|
263
|
+ Elements metaOgRegDate = crawlDoc.select(NAVER_REGX_CONTENTS_REGDATE_GET);
|
|
|
264
|
+ if (metaOgRegDate != null) {
|
|
|
265
|
+ regDate = metaOgRegDate.text();
|
|
|
266
|
+ }
|
|
|
267
|
+
|
|
|
268
|
+ StringBuilder contentsBuilder = new StringBuilder();
|
|
|
269
|
+ Elements articleBodyContentsRows = crawlDoc.select(NAVER_REGX_CONTENTS_GET);
|
|
|
270
|
+ String contents;
|
|
|
271
|
+ int docContentsLength = 0;
|
|
|
272
|
+ boolean contentsStatus = false;
|
|
|
273
|
+ loop:
|
|
|
274
|
+ for (Element articleBodyContentsRow : articleBodyContentsRows) {
|
|
|
275
|
+ Iterator<Element> iterElem = articleBodyContentsRow.getAllElements().iterator();
|
|
|
276
|
+ while(iterElem.hasNext()){
|
|
|
277
|
+ String text = iterElem.next().text();
|
|
|
278
|
+ docContentsLength += text.length();
|
|
|
279
|
+ contentsBuilder.append(text);
|
|
|
280
|
+ if(docContentsLength > maxContentsLength){
|
|
|
281
|
+ contentsStatus = true;
|
|
|
282
|
+ break loop;
|
|
|
283
|
+ }
|
|
|
284
|
+ }
|
|
|
285
|
+ }
|
|
|
286
|
+ if(contentsStatus) contents = contentsBuilder.toString().substring(0, maxContentsLength);
|
|
|
287
|
+ else contents = contentsBuilder.toString();
|
|
|
288
|
+
|
|
|
289
|
+ try {
|
|
|
290
|
+ if(connection.isClosed()) {
|
|
|
291
|
+ connection = DriverManager.getConnection(DB_URL,USERNAME,PASSWORD);
|
|
|
292
|
+ }
|
|
|
293
|
+ String sql = "insert into stock_news(id,category,url,title,regdate,contents) values(?,?,?,?,?,?);";
|
|
|
294
|
+
|
|
|
295
|
+ connection.setAutoCommit(false);
|
|
|
296
|
+
|
|
|
297
|
+ statement = connection.prepareStatement(sql);
|
|
|
298
|
+ statement.setString(1, newsId);
|
|
|
299
|
+ statement.setString(2, category);
|
|
|
300
|
+ statement.setString(3, crawlDoc.baseUri());
|
|
|
301
|
+ statement.setString(4, title);
|
|
|
302
|
+ statement.setString(5, regDate);
|
|
|
303
|
+ statement.setString(6, contents);
|
|
|
304
|
+ statement.executeUpdate();
|
|
|
305
|
+
|
|
|
306
|
+ connection.commit();
|
|
|
307
|
+
|
|
|
308
|
+ } catch (SQLException e) {
|
|
|
309
|
+ // TODO Auto-generated catch block
|
|
|
310
|
+// e.printStackTrace();
|
|
|
311
|
+ }
|
|
|
312
|
+ }
|
|
|
313
|
+ }
|
|
|
314
|
+
|
|
|
315
|
+ @Override
|
|
|
316
|
+ public void shutDown(){
|
|
|
317
|
+ super.shutDown();
|
|
|
318
|
+ }
|
|
|
319
|
+
|
|
|
320
|
+ }
|
|
|
321
|
+
|
|
|
322
|
+}
|
|
|
323
|
+
|