tobby48 5 年前
父节点
当前提交
d33fff4b6c

+ 100
- 0
opensource.log.20200213 查看文件

@@ -0,0 +1,100 @@
1
+[16:38:12.779][ERROR][LoggerFactory$Log4jLogger:   59] - Error executing FreeMarker template
2
+FreeMarker template error:
3
+For "." left-hand operand: Expected a hash, but this has evaluated to a string (wrapper: f.t.SimpleScalar):
4
+==> i  [in template "test.ftl" at line 48, column 8]
5
+
6
+----
7
+FTL stack trace ("~" means nesting-related):
8
+	- Failed at: ${i.title}  [in template "test.ftl" at line 48, column 6]
9
+----
10
+
11
+Java stack trace (for programmers):
12
+----
13
+freemarker.core.NonHashException: [... Exception message was already printed; see it above ...]
14
+	at freemarker.core.Dot._eval(Dot.java:48)
15
+	at freemarker.core.Expression.eval(Expression.java:101)
16
+	at freemarker.core.DollarVariable.calculateInterpolatedStringOrMarkup(DollarVariable.java:100)
17
+	at freemarker.core.DollarVariable.accept(DollarVariable.java:63)
18
+	at freemarker.core.Environment.visit(Environment.java:367)
19
+	at freemarker.core.IteratorBlock$IterationContext.executedNestedContentForCollOrSeqListing(IteratorBlock.java:321)
20
+	at freemarker.core.IteratorBlock$IterationContext.executeNestedContent(IteratorBlock.java:271)
21
+	at freemarker.core.IteratorBlock$IterationContext.accept(IteratorBlock.java:244)
22
+	at freemarker.core.Environment.visitIteratorBlock(Environment.java:643)
23
+	at freemarker.core.IteratorBlock.acceptWithResult(IteratorBlock.java:108)
24
+	at freemarker.core.IteratorBlock.accept(IteratorBlock.java:94)
25
+	at freemarker.core.Environment.visit(Environment.java:331)
26
+	at freemarker.core.Environment.visit(Environment.java:337)
27
+	at freemarker.core.Environment.process(Environment.java:310)
28
+	at freemarker.template.Template.process(Template.java:383)
29
+	at kr.co.swh.lecture.opensource.sparkjava.FreeMarkerTemplateEngine.render(FreeMarkerTemplateEngine.java:26)
30
+	at spark.TemplateViewRouteImpl$1.render(TemplateViewRouteImpl.java:61)
31
+	at spark.TemplateViewRouteImpl$1.render(TemplateViewRouteImpl.java:58)
32
+	at spark.TemplateViewRouteImpl.render(TemplateViewRouteImpl.java:86)
33
+	at spark.http.matching.Routes.execute(Routes.java:63)
34
+	at spark.http.matching.MatcherFilter.doFilter(MatcherFilter.java:134)
35
+	at spark.embeddedserver.jetty.JettyHandler.doHandle(JettyHandler.java:50)
36
+	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1568)
37
+	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
38
+	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
39
+	at org.eclipse.jetty.server.Server.handle(Server.java:530)
40
+	at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:347)
41
+	at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:256)
42
+	at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:279)
43
+	at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:102)
44
+	at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:124)
45
+	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:247)
46
+	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:140)
47
+	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)
48
+	at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:382)
49
+	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:708)
50
+	at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:626)
51
+	at java.lang.Thread.run(Thread.java:748)
52
+[16:51:55.912][ERROR][LoggerFactory$Log4jLogger:   59] - Error executing FreeMarker template
53
+FreeMarker template error:
54
+For "." left-hand operand: Expected a hash, but this has evaluated to a string (wrapper: f.t.SimpleScalar):
55
+==> i  [in template "news.ftl" at line 15, column 18]
56
+
57
+----
58
+FTL stack trace ("~" means nesting-related):
59
+	- Failed at: ${i.id}  [in template "news.ftl" at line 15, column 16]
60
+----
61
+
62
+Java stack trace (for programmers):
63
+----
64
+freemarker.core.NonHashException: [... Exception message was already printed; see it above ...]
65
+	at freemarker.core.Dot._eval(Dot.java:48)
66
+	at freemarker.core.Expression.eval(Expression.java:101)
67
+	at freemarker.core.DollarVariable.calculateInterpolatedStringOrMarkup(DollarVariable.java:100)
68
+	at freemarker.core.DollarVariable.accept(DollarVariable.java:63)
69
+	at freemarker.core.Environment.visit(Environment.java:367)
70
+	at freemarker.core.IteratorBlock$IterationContext.executedNestedContentForCollOrSeqListing(IteratorBlock.java:321)
71
+	at freemarker.core.IteratorBlock$IterationContext.executeNestedContent(IteratorBlock.java:271)
72
+	at freemarker.core.IteratorBlock$IterationContext.accept(IteratorBlock.java:244)
73
+	at freemarker.core.Environment.visitIteratorBlock(Environment.java:643)
74
+	at freemarker.core.IteratorBlock.acceptWithResult(IteratorBlock.java:108)
75
+	at freemarker.core.IteratorBlock.accept(IteratorBlock.java:94)
76
+	at freemarker.core.Environment.visit(Environment.java:331)
77
+	at freemarker.core.Environment.visit(Environment.java:337)
78
+	at freemarker.core.Environment.process(Environment.java:310)
79
+	at freemarker.template.Template.process(Template.java:383)
80
+	at kr.co.swh.lecture.opensource.sparkjava.FreeMarkerTemplateEngine.render(FreeMarkerTemplateEngine.java:26)
81
+	at spark.TemplateViewRouteImpl$1.render(TemplateViewRouteImpl.java:61)
82
+	at spark.TemplateViewRouteImpl$1.render(TemplateViewRouteImpl.java:58)
83
+	at spark.TemplateViewRouteImpl.render(TemplateViewRouteImpl.java:86)
84
+	at spark.http.matching.Routes.execute(Routes.java:63)
85
+	at spark.http.matching.MatcherFilter.doFilter(MatcherFilter.java:134)
86
+	at spark.embeddedserver.jetty.JettyHandler.doHandle(JettyHandler.java:50)
87
+	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1568)
88
+	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
89
+	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
90
+	at org.eclipse.jetty.server.Server.handle(Server.java:530)
91
+	at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:347)
92
+	at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:256)
93
+	at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:279)
94
+	at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:102)
95
+	at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:124)
96
+	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:247)
97
+	at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:140)
98
+	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:708)
99
+	at org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:626)
100
+	at java.lang.Thread.run(Thread.java:748)

+ 323
- 0
src/main/java/kr/co/swh/lecture/opensource/project/news/NaverNewsCrawlSourceDB.java 查看文件

@@ -0,0 +1,323 @@
1
+package kr.co.swh.lecture.opensource.project.news;
2
+
3
+import java.io.IOException;
4
+import java.sql.Connection;
5
+import java.sql.DriverManager;
6
+import java.sql.PreparedStatement;
7
+import java.sql.SQLException;
8
+import java.util.HashMap;
9
+import java.util.Iterator;
10
+import java.util.LinkedList;
11
+import java.util.List;
12
+import java.util.Map;
13
+import java.util.regex.Matcher;
14
+import java.util.regex.Pattern;
15
+
16
+import org.jsoup.Jsoup;
17
+import org.jsoup.nodes.Document;
18
+import org.jsoup.nodes.Element;
19
+import org.jsoup.select.Elements;
20
+
21
+import com.google.common.collect.Lists;
22
+
23
+import scala.Tuple2;
24
+
25
+
26
+public class NaverNewsCrawlSourceDB 
27
+{
28
+	private String domain, startDate, endDate;
29
+	private String[] category;
30
+	private Boolean fastMode;
31
+	
32
+	private ICrawlService service;
33
+	private final int maxContentsLength = 300000;
34
+	public static LinkedList<String> execTailQueue = new LinkedList<String>();
35
+	
36
+	//	HTML TAG
37
+	public final static String TAG_SCRIPT = "script";
38
+	public final static String META_KEY_CONTENT = "content";
39
+	public final static String TAG_A = "a";
40
+	public final static String TAG_IMG = "img";
41
+	public final static String TAG_HREF = "href";
42
+	public final static String TAG_HTTP = "http";
43
+	
44
+	//	REGX
45
+	public final static String REGX_META_CONTENTS_TITLE = "meta[property=og:title]";
46
+	public final static String REGX_META_CONTENTS_REGDATE = "meta[property=og:regDate]";
47
+	public final static String REGX_META_CONTENTS_PUBDATE = "meta[property=article:published_time]";
48
+	
49
+	//////////////////////////////////////////////////////
50
+	//	NAVER
51
+	//	CATEGORY
52
+	//	정치, 경제, 사회, 문화, 세계, 과학, IT  (except. 연예, 스포츠, 기후)
53
+	public final static String[] NAVER_CATEGORY = {"politics", "economy", "society", "culture", "world", "science", "info-tech", "sports", "entertainment", "weather"};
54
+	public final static Map<String, Tuple2<String, String>> NAVER_CATEGORY_MAP = new HashMap<String, Tuple2<String, String>>();
55
+	static{
56
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[0], new Tuple2<String, String>("sid1=100", "sid2=269"));
57
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[1], new Tuple2<String, String>("sid1=101", "sid2=263"));
58
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[2], new Tuple2<String, String>("sid1=102", "sid2=257"));
59
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[3], new Tuple2<String, String>("sid1=103", "sid2=245"));
60
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[4], new Tuple2<String, String>("sid1=104", "sid2=322"));
61
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[5], new Tuple2<String, String>("sid1=105", "sid2=228"));
62
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[6], new Tuple2<String, String>("sid1=105", "sid2=230"));
63
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[7], new Tuple2<String, String>("sid1=100", "sid2=269"));
64
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[8], new Tuple2<String, String>("sid1=100", "sid2=269"));
65
+		NAVER_CATEGORY_MAP.put(NAVER_CATEGORY[9], new Tuple2<String, String>("sid1=100", "sid2=269"));
66
+	}
67
+	//	URL
68
+	public final static String NAVER_URL_HEAD = "http://news.naver.com";
69
+	public final static String NAVER_FETCH_URL_HEAD = NAVER_URL_HEAD + "/main/list.nhn?%s&%s&mid=shm&mode=LS2D&date=%s&page=%s";	//	sid1, sid2, date, paging
70
+	
71
+	//	NAVER REGX
72
+	public final static String NAVER_REGX_FETCH_SUBURL = "a[href]";
73
+	public final static String NAVER_REGX_FETCH_NEWS_ID = "(?<=(&oid=|\\?oid=)|(&aid=|\\?aid=))[\\d]+";
74
+	
75
+	public final static String NAVER_REGX_CONTENTS_REGDATE_GET = ".sponsor > span";
76
+	public final static String NAVER_REGX_CONTENTS_GET = "#articleBodyContents";
77
+	Connection connection = null;
78
+	PreparedStatement statement = null;
79
+	static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";  
80
+	static final String DB_URL = "jdbc:mysql://dev-swh.ga:3306/market";
81
+
82
+	static final String USERNAME = "root";
83
+	static final String PASSWORD = "swhacademy!";
84
+	
85
+	public void init() {
86
+		domain = "naver";
87
+		category = new String[]{"politics", "economy", "society", "culture", "world", "science", "info-tech", "sports", "entertainment", "weather"};
88
+		startDate = "20190101";
89
+		endDate = "20190131";
90
+		fastMode = true;
91
+		service = new NaverCrawlManager(domain, startDate, endDate, category, fastMode);
92
+			
93
+		try {
94
+			Class.forName(JDBC_DRIVER);
95
+			connection = DriverManager.getConnection(DB_URL,USERNAME,PASSWORD);
96
+			System.out.println("MariaDB 연결.");
97
+			service.crawlling();
98
+		} catch (Exception e) {
99
+			// TODO Auto-generated catch block
100
+			e.printStackTrace();
101
+		}
102
+	}
103
+
104
+	public static void main(String[] args) {
105
+		
106
+		NaverNewsCrawlSourceDB source = new NaverNewsCrawlSourceDB();
107
+		source.init();
108
+//		System.out.println(NaverCrawlManager.totalDayFromCalendar(2019,10,20));
109
+//		System.out.println(NaverCrawlManager.endDayFromTotalDay(2019,10));
110
+//		System.out.println(StringUtils.leftPad(String.valueOf(1), 2, "0"));
111
+//		System.out.println(StringUtils.leftPad(String.valueOf(9), 5, "4"));
112
+	}
113
+	
114
+	class NaverCrawlManager extends A_CrawlClient{
115
+		
116
+		public NaverCrawlManager(String domain, String startDate, String endDate, String[] category, boolean fastMode)
117
+		{
118
+			super(domain, startDate, endDate, category, fastMode);
119
+		}
120
+		
121
+		@Override
122
+		public void fetch(String sourceDate) throws Exception{
123
+			List<String> fetchDateList = getFetchDateList(sourceDate);
124
+			rootBreak:
125
+				for(String crawlDate : fetchDateList){
126
+					currentStartDate = crawlDate;
127
+					for(String category : categories){
128
+						System.out.println(currentStartDate + " : " + category);
129
+						for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
130
+							//	shutdown
131
+							if(articleBreakLatch.getCount() < 1) break rootBreak;
132
+							try {
133
+								
134
+								List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
135
+								int cnt = findNewsDocList.size();
136
+								if(cnt <= 0) {
137
+									break;
138
+								}
139
+								contentsCrawlling(findNewsDocList, category);
140
+								
141
+								Thread.sleep(sleepTime);
142
+							} catch (IOException | InterruptedException e) {
143
+								// TODO Auto-generated catch block
144
+								throw e;
145
+							}
146
+						}
147
+						
148
+					}
149
+					cacheNewsId.clear();
150
+				}
151
+		}
152
+		
153
+		@Override
154
+		public void monitor(){
155
+			new Thread(new Runnable(){
156
+				@Override
157
+				public void run() {
158
+					// TODO Auto-generated method stub
159
+					while(true){
160
+						String currentSystemDate = currentDate();
161
+						
162
+						List<String> monitorArticleList = getStringFormatDateList(monitoringStartDate, currentSystemDate);
163
+						for(String crawlDate : monitorArticleList){
164
+							currentStartDate = crawlDate;
165
+							for(String category : categories){
166
+								int checkPageBlockNum = monitoringNewPageBlockNum;
167
+								for(int pageIndex = 1; pageIndex < Integer.MAX_VALUE; pageIndex++){
168
+									try {
169
+										List<String> findNewsDocList = getArticleURLHeader(pageIndex, category, crawlDate);
170
+										if(findNewsDocList.size() <= 0) {
171
+											checkPageBlockNum--;
172
+										}
173
+										else checkPageBlockNum = monitoringNewPageBlockNum;
174
+										
175
+										if(checkPageBlockNum <= 0) {
176
+											checkPageBlockNum = monitoringNewPageBlockNum;
177
+											break;
178
+										}
179
+										contentsCrawlling(findNewsDocList, category);
180
+										
181
+										Thread.sleep(sleepTime);
182
+									} catch (IOException | InterruptedException e) {
183
+										// TODO Auto-generated catch block
184
+										articleBreakLatch.countDown();
185
+									}
186
+								}
187
+							}
188
+						}
189
+						if(updateSystemDateMonitoringDate(currentSystemDate)) cacheNewsId.clear();
190
+						if(!isContinueCrawl()) {
191
+							articleBreakLatch.countDown();
192
+						}
193
+					}
194
+				}
195
+			}).start();
196
+			
197
+			try {
198
+				articleBreakLatch.await();
199
+			} catch (InterruptedException e) {
200
+				// TODO Auto-generated catch block
201
+			} finally {
202
+			}
203
+		}
204
+		
205
+		private List<String> getArticleURLHeader(int pageIndex, String category, String crawlDate) throws IOException{
206
+			List<String> findNewsDocList = Lists.newArrayList();
207
+			String page = String.valueOf(pageIndex);
208
+			Tuple2<String, String> seedKey = NAVER_CATEGORY_MAP.get(category);
209
+			String fetchURL = String.format(NAVER_FETCH_URL_HEAD, seedKey._1(), seedKey._2(), crawlDate, page);
210
+			Document fetchData = Jsoup.connect(fetchURL).timeout(0).get();
211
+			Elements fetchTmpDatas = fetchData.select(NAVER_REGX_FETCH_SUBURL);
212
+			for(Element doc : fetchTmpDatas){
213
+				if(!doc.select(TAG_A).is(TAG_IMG) && 
214
+						doc.attr(TAG_HREF).startsWith(TAG_HTTP) &&
215
+						doc.attr(TAG_HREF).contains(seedKey._1()) &&
216
+						doc.attr(TAG_HREF).contains(seedKey._2())){
217
+					
218
+					//	네이버에서는 페이지 카운트에 대한 종료를 위해서는 페이지의 주소 페이지 주소로 종료
219
+					if(cacheNewsId.contains(doc.attr(TAG_HREF))) continue;
220
+					cacheNewsId.add(doc.attr(TAG_HREF));
221
+					findNewsDocList.add(doc.attr(TAG_HREF));
222
+				}
223
+			}
224
+			return findNewsDocList;
225
+		}
226
+		
227
+		//	CONTENTS GET
228
+		private void contentsCrawlling(List<String> findNewsDocList, String category) throws IOException{
229
+			int retry = 3;
230
+			for(int index = 0; index < findNewsDocList.size(); index++){
231
+				String newsDocUrl = findNewsDocList.get(index);
232
+				Document crawlDoc;
233
+				try {
234
+					crawlDoc = Jsoup.connect(newsDocUrl).timeout(0).get();
235
+				} catch (IOException e) {
236
+					// TODO Auto-generated catch block
237
+					if(retry < 0) throw e;
238
+					index--;
239
+					continue;
240
+				}
241
+
242
+				//	newsId
243
+				String newsId = "";
244
+				Pattern pattern = Pattern.compile(NAVER_REGX_FETCH_NEWS_ID, Pattern.CASE_INSENSITIVE);
245
+				Matcher m = pattern.matcher(crawlDoc.baseUri());
246
+				int findIndex = 0;
247
+				while(m.find(findIndex)){
248
+					newsId += crawlDoc.baseUri().substring(m.start(), m.end());
249
+					newsId += "-";
250
+					findIndex = m.end();
251
+				}
252
+				newsId = newsId.substring(0, newsId.length()-1);
253
+				
254
+				//	title
255
+				String title = "";
256
+				Elements metaOgTitle = crawlDoc.select(REGX_META_CONTENTS_TITLE);
257
+				if (metaOgTitle!=null) {
258
+					title = metaOgTitle.attr(META_KEY_CONTENT);
259
+				}
260
+				
261
+				//	regDate
262
+				String regDate = "";
263
+				Elements metaOgRegDate = crawlDoc.select(NAVER_REGX_CONTENTS_REGDATE_GET);
264
+				if (metaOgRegDate != null) {
265
+					regDate = metaOgRegDate.text();
266
+				}
267
+
268
+				StringBuilder contentsBuilder = new StringBuilder();
269
+				Elements articleBodyContentsRows = crawlDoc.select(NAVER_REGX_CONTENTS_GET);
270
+				String contents;
271
+				int docContentsLength = 0;
272
+				boolean contentsStatus = false;
273
+				loop:
274
+				for (Element articleBodyContentsRow : articleBodyContentsRows) {
275
+					Iterator<Element> iterElem = articleBodyContentsRow.getAllElements().iterator();
276
+					while(iterElem.hasNext()){
277
+						String text = iterElem.next().text();
278
+						docContentsLength += text.length();
279
+						contentsBuilder.append(text);
280
+						if(docContentsLength > maxContentsLength){
281
+							contentsStatus = true;
282
+							break loop;
283
+						}
284
+					}
285
+				}
286
+				if(contentsStatus) contents = contentsBuilder.toString().substring(0, maxContentsLength);
287
+				else contents = contentsBuilder.toString();
288
+				
289
+				try {
290
+					if(connection.isClosed()) {
291
+						connection = DriverManager.getConnection(DB_URL,USERNAME,PASSWORD);
292
+					}
293
+					String sql = "insert into stock_news(id,category,url,title,regdate,contents) values(?,?,?,?,?,?);";
294
+					
295
+					connection.setAutoCommit(false);
296
+
297
+					statement = connection.prepareStatement(sql);
298
+					statement.setString(1, newsId);
299
+					statement.setString(2, category);
300
+					statement.setString(3, crawlDoc.baseUri());
301
+					statement.setString(4, title);
302
+					statement.setString(5, regDate);
303
+					statement.setString(6, contents);
304
+					statement.executeUpdate();
305
+					
306
+					connection.commit();
307
+					
308
+				} catch (SQLException e) {
309
+					// TODO Auto-generated catch block
310
+//					e.printStackTrace();
311
+				}
312
+			}
313
+		}
314
+		
315
+		@Override
316
+		public void shutDown(){
317
+			super.shutDown();
318
+		}
319
+		
320
+	}
321
+
322
+}
323
+