22
33namespace FriendsOfRedaxo \DiffDetect ;
44
5+ use Exception ;
56use Html2Text \Html2Text ;
67use InvalidArgumentException ;
78use rex ;
1011use rex_instance_pool_trait ;
1112use rex_sql ;
1213use rex_sql_exception ;
13- use voku \helper \HtmlDomParser ;
14+
15+ use function is_array ;
16+ use function sprintf ;
1417
1518final class Index
1619{
@@ -27,7 +30,7 @@ private function __construct(int $id)
2730 }
2831
2932 /**
30- * @return null| static
33+ * @return static|null
3134 */
3235 public static function get (int $ id ): ?self
3336 {
@@ -103,18 +106,28 @@ private static function fromSqlData(array $data): self
103106 public static function createSnapshot (Url $ url ): bool
104107 {
105108 $ url ->setLastScan ();
106- $ response = $ url ->getContent ();
107- $ content = $ response ->getBody ();
109+ $ content = '' ;
110+ $ headers = [];
111+ try {
112+ $ response = $ url ->getResponse ();
113+ $ content = $ response ['Content ' ] ?? '' ;
114+ $ headers = $ response ['Headers ' ] ?? [];
115+ $ statusCode = $ response ['StatusCode ' ] ?? 0 ;
116+ $ statusMessage = '[ ' . $ statusCode . '] OK ' ;
117+ } catch (Exception $ e ) {
118+ $ statusCode = $ e ->getCode ();
119+ $ statusMessage = '[ ' . $ statusCode . '] ' . $ e ->getMessage ();
120+ }
121+
122+ $ url ->setLastMessage ($ statusMessage );
123+
124+ $ headers = self ::flattenArray ($ headers );
108125
109126 if ('HTML ' === $ url ->getType ()) {
110- $ content = preg_replace ('/<script\b[^>]*>(.*?)<\/script>/is ' , '' , $ content );
111- $ content = preg_replace ('/<style\b[^>]*>(.*?)<\/style>/is ' , '' , $ content );
112- $ content = preg_replace ('/<noscript\b[^>]*>(.*?)<\/noscript>/is ' , '' , $ content );
113- $ content = strip_tags ($ content , ['img ' , 'video ' ]);
127+ $ content = (new Html2Text ($ content ))->getText ();
114128 }
115129
116130 $ hash = md5 ($ content );
117-
118131 $ sql = rex_sql::factory ();
119132 $ sql ->setTable (rex::getTable ('diff_detect_index ' ));
120133 $ sql ->setWhere ('url_id = ? ORDER BY createdate DESC LIMIT 1 ' , [$ url ->getId ()]);
@@ -132,11 +145,11 @@ public static function createSnapshot(Url $url): bool
132145 $ sql ->addGlobalCreateFields ();
133146 $ sql ->addGlobalUpdateFields ();
134147 $ sql ->setValue ('url_id ' , $ url ->getId ());
135- $ sql ->setValue ('content ' , $ response -> getBody () );
148+ $ sql ->setValue ('content ' , $ content );
136149 $ sql ->setValue ('hash ' , $ hash );
137- $ sql ->setValue ('header ' , $ response -> getHeader ( ));
138- $ sql ->setValue ('statusCode ' , $ response -> getStatusCode () );
139- $ sql ->setValue ('statusMessage ' , $ response -> getStatusMessage () );
150+ $ sql ->setValue ('header ' , implode ( ' , ' , $ headers ));
151+ $ sql ->setValue ('statusCode ' , $ statusCode );
152+ $ sql ->setValue ('statusMessage ' , $ statusMessage );
140153 $ sql ->insert ();
141154
142155 return true ;
@@ -167,6 +180,8 @@ public static function cleanUpSnapshots(): void
167180 WHERE
168181 url_id = :url_id
169182 AND createdate < DATE_SUB(:datetime, INTERVAL :interval MINUTE)
183+ ORDER BY createdate ASC
184+ LIMIT 100
170185 ' , [
171186 'url_id ' => $ URL ->getId (),
172187 'datetime ' => date (rex_sql::FORMAT_DATETIME ),
@@ -176,6 +191,7 @@ public static function cleanUpSnapshots(): void
176191 foreach ($ indeces as $ Index ) {
177192 $ Index = self ::fromSqlData ($ Index );
178193 $ Index ->delete ();
194+ echo ' Deleted index with ID: ' . $ Index ->getId () . ' for URL: ' . $ URL ->getName () . "\n" ;
179195 }
180196 }
181197 }
@@ -194,14 +210,7 @@ public function getUrl(): ?Url
194210
/**
 * Returns the stored snapshot content of this index entry.
 *
 * The value is returned exactly as stored in the `content` field; no
 * HTML-to-text conversion happens at read time any more. For URLs of type
 * HTML the conversion is applied by createSnapshot() before the content is
 * written, so the stored value is expected to be plain text already.
 *
 * NOTE(review): rows written before this change may still hold the raw
 * response body — confirm whether existing data needs a migration.
 *
 * @return string the stored content
 */
public function getContent(): string
{
    return $this->getValue('content');
}
206215
207216 public function delete (): void
@@ -220,4 +229,21 @@ public function delete(): void
220229 throw new rex_exception ($ sql ->getError ());
221230 }
222231 }
232+
/**
 * Flattens a nested array into a single-level array with dot-separated keys.
 *
 * Each nested level contributes one key segment, joined with '.', e.g.
 * ['a' => ['b' => 1]] becomes ['a.b' => 1]. Scalar values at the top level
 * keep their original key (including integer keys).
 *
 * @param array<array-key, mixed> $array  the (possibly nested) array to flatten
 * @param string|int              $prefix key path accumulated so far; '' at the top level
 *
 * @return array<array-key, mixed> flat map of key path => leaf value
 */
private static function flattenArray($array, $prefix = ''): array
{
    $result = [];

    foreach ($array as $key => $value) {
        $newKey = '' === $prefix ? $key : $prefix . '.' . $key;

        if (is_array($value)) {
            // array_replace() instead of array_merge(): array_merge() renumbers
            // integer keys already collected in $result, which would silently
            // rewrite top-level numeric keys. Later values still win on a
            // key collision, exactly as before.
            $result = array_replace($result, self::flattenArray($value, $newKey));
        } else {
            $result[$newKey] = $value;
        }
    }

    return $result;
}
223249}
0 commit comments