Skip to content

Commit 21f2bed

Browse files
authored
Merge pull request #43 from FriendsOfREDAXO/socket_to_httpclient
rex_socket to http client - mehr debugging bei fehlern
2 parents 0ce2a46 + 25a7e98 commit 21f2bed

File tree

1,636 files changed

+85369
-25664
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,636 files changed

+85369
-25664
lines changed

composer.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
{
22
"license": "MIT",
33
"require": {
4-
"jfcherng/php-diff": "^6.15",
4+
"jfcherng/php-diff": "^6",
55
"html2text/html2text": "^4.3",
66
"voku/simple_html_dom": "^4.8",
7-
"laminas/laminas-feed": "^2.20",
7+
"laminas/laminas-feed": "^2",
88
"mkalkbrenner/php-htmldiff-advanced": "^0.0.8",
9-
"ezyang/htmlpurifier": "^4.16"
9+
"ezyang/htmlpurifier": "*",
10+
"symfony/http-client": "^7.3"
1011
},
1112
"replace": {
1213
"psr/container": "*",

composer.lock

Lines changed: 1971 additions & 1078 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

install.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
->ensureColumn(new rex_sql_column('categories', 'text', false, ''))
1616
->ensureColumn(new rex_sql_column('status', 'tinyint', false, '0'))
1717
->ensureColumn(new rex_sql_column('interval', 'int(10) unsigned', false, '1440'))
18+
->ensureColumn(new rex_sql_column('last_message', 'text', false, ''))
1819
->ensureColumn(new rex_sql_column('http_auth_login', 'VARCHAR(100)', true))
1920
->ensureColumn(new rex_sql_column('http_auth_password', 'VARCHAR(100)', true))
2021
->ensureColumn(new rex_sql_column('last_scan', 'datetime', true))

lang/de_de.lang

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ diff_detect_snapshot_error = Snapshot von `{0}` konnte nicht erstellt werden. Fe
7171
back_to_snapshots = Zurück zu Snapshots
7272
last_scan = Letzter Scanversuch
7373
last_snapshot = Letzter Snapshot
74+
last_message = Letzte Scaninfo
7475

7576
interval_in_min_5 = 5 Minuten
7677
interval_in_min_15 = 15 Minuten

lib/Command/Console.php

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
<?php
2+
3+
namespace FriendsOfRedaxo\DiffDetect\Command;
4+
5+
use FriendsOfRedaxo\DiffDetect\Index;
6+
use FriendsOfRedaxo\DiffDetect\Url;
7+
use rex;
8+
use rex_console_command;
9+
use rex_exception;
10+
use rex_sql;
11+
use Symfony\Component\Console\Input\InputInterface;
12+
use Symfony\Component\Console\Output\OutputInterface;
13+
14+
use function count;
15+
16+
class Console extends rex_console_command
17+
{
18+
protected function execute(InputInterface $input, OutputInterface $output): int
19+
{
20+
$io = $this->getStyle($input, $output);
21+
$io->title('DiffDetect Console Command');
22+
23+
$sql = rex_sql::factory();
24+
$URLs = $sql->getArray(
25+
'
26+
SELECT u.*
27+
, i.createdate AS last_index_date
28+
FROM ' . rex::getTable('diff_detect_url') . ' u
29+
LEFT JOIN (
30+
SELECT url_id, MAX(createdate) createdate
31+
FROM ' . rex::getTable('diff_detect_index') . '
32+
GROUP BY url_id
33+
) i
34+
ON u.id = i.url_id
35+
WHERE u.status = 1
36+
AND (
37+
u.last_scan IS NULL
38+
OR u.last_scan < DATE_SUB(:datetime, INTERVAL u.interval MINUTE)
39+
)
40+
order by u.last_scan
41+
LIMIT 5
42+
',
43+
[
44+
'datetime' => date(rex_sql::FORMAT_DATETIME),
45+
],
46+
);
47+
48+
foreach ($URLs as $URLArray) {
49+
$Url = Url::get($URLArray['id']);
50+
try {
51+
if (Index::createSnapshot($Url)) {
52+
$io->success('Snapshot created for URL: ' . $Url->getName() . ' / ' . $Url->getUrl() . ' [' . $Url->getId() . ']');
53+
} else {
54+
$io->success('Snapshot NOT created for URL: ' . $Url->getName() . ' / ' . $Url->getUrl() . ' [' . $Url->getId() . ']');
55+
}
56+
} catch (rex_exception $e) {
57+
$io->error('Snapshot error for URL: ' . $Url->getName() . ' / ' . $Url->getUrl() . ' [' . $Url->getId() . ']');
58+
break;
59+
}
60+
}
61+
62+
if (0 === count($URLs)) {
63+
$io->info('no snapshots');
64+
}
65+
66+
$io->text('Total URLs processed: ' . count($URLs));
67+
$io->text('');
68+
69+
Index::cleanUpSnapshots();
70+
71+
return 1;
72+
}
73+
}

lib/Index.php

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace FriendsOfRedaxo\DiffDetect;
44

5+
use Exception;
56
use Html2Text\Html2Text;
67
use InvalidArgumentException;
78
use rex;
@@ -10,7 +11,9 @@
1011
use rex_instance_pool_trait;
1112
use rex_sql;
1213
use rex_sql_exception;
13-
use voku\helper\HtmlDomParser;
14+
15+
use function is_array;
16+
use function sprintf;
1417

1518
final class Index
1619
{
@@ -27,7 +30,7 @@ private function __construct(int $id)
2730
}
2831

2932
/**
30-
* @return null|static
33+
* @return static|null
3134
*/
3235
public static function get(int $id): ?self
3336
{
@@ -103,18 +106,28 @@ private static function fromSqlData(array $data): self
103106
public static function createSnapshot(Url $url): bool
104107
{
105108
$url->setLastScan();
106-
$response = $url->getContent();
107-
$content = $response->getBody();
109+
$content = '';
110+
$headers = [];
111+
try {
112+
$response = $url->getResponse();
113+
$content = $response['Content'] ?? '';
114+
$headers = $response['Headers'] ?? [];
115+
$statusCode = $response['StatusCode'] ?? 0;
116+
$statusMessage = '[' . $statusCode . '] OK';
117+
} catch (Exception $e) {
118+
$statusCode = $e->getCode();
119+
$statusMessage = '[' . $statusCode . '] ' . $e->getMessage();
120+
}
121+
122+
$url->setLastMessage($statusMessage);
123+
124+
$headers = self::flattenArray($headers);
108125

109126
if ('HTML' === $url->getType()) {
110-
$content = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $content);
111-
$content = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $content);
112-
$content = preg_replace('/<noscript\b[^>]*>(.*?)<\/noscript>/is', '', $content);
113-
$content = strip_tags($content, ['img', 'video']);
127+
$content = (new Html2Text($content))->getText();
114128
}
115129

116130
$hash = md5($content);
117-
118131
$sql = rex_sql::factory();
119132
$sql->setTable(rex::getTable('diff_detect_index'));
120133
$sql->setWhere('url_id = ? ORDER BY createdate DESC LIMIT 1', [$url->getId()]);
@@ -132,11 +145,11 @@ public static function createSnapshot(Url $url): bool
132145
$sql->addGlobalCreateFields();
133146
$sql->addGlobalUpdateFields();
134147
$sql->setValue('url_id', $url->getId());
135-
$sql->setValue('content', $response->getBody());
148+
$sql->setValue('content', $content);
136149
$sql->setValue('hash', $hash);
137-
$sql->setValue('header', $response->getHeader());
138-
$sql->setValue('statusCode', $response->getStatusCode());
139-
$sql->setValue('statusMessage', $response->getStatusMessage());
150+
$sql->setValue('header', implode(',', $headers));
151+
$sql->setValue('statusCode', $statusCode);
152+
$sql->setValue('statusMessage', $statusMessage);
140153
$sql->insert();
141154

142155
return true;
@@ -167,6 +180,8 @@ public static function cleanUpSnapshots(): void
167180
WHERE
168181
url_id = :url_id
169182
AND createdate < DATE_SUB(:datetime, INTERVAL :interval MINUTE)
183+
ORDER BY createdate ASC
184+
LIMIT 100
170185
', [
171186
'url_id' => $URL->getId(),
172187
'datetime' => date(rex_sql::FORMAT_DATETIME),
@@ -176,6 +191,7 @@ public static function cleanUpSnapshots(): void
176191
foreach ($indeces as $Index) {
177192
$Index = self::fromSqlData($Index);
178193
$Index->delete();
194+
echo ' Deleted index with ID: ' . $Index->getId() . ' for URL: ' . $URL->getName() . "\n";
179195
}
180196
}
181197
}
@@ -194,14 +210,7 @@ public function getUrl(): ?Url
194210

195211
public function getContent(): string
196212
{
197-
if ('RSS' === $this->url?->getType()) {
198-
return $this->getValue('content');
199-
}
200-
201-
$content = $this->getValue('content');
202-
// $content = HtmlDomParser::str_get_html($content)->findOne('#content')->innerHtml();
203-
$content = (new Html2Text($content))->getText();
204-
return $content;
213+
return $this->getValue('content');
205214
}
206215

207216
public function delete(): void
@@ -220,4 +229,21 @@ public function delete(): void
220229
throw new rex_exception($sql->getError());
221230
}
222231
}
232+
233+
/**
 * Flattens a nested array into a single level, joining nested keys with dots.
 *
 * Example: ['a' => ['b' => 1, 'c' => ['d' => 2]]] becomes ['a.b' => 1, 'a.c.d' => 2].
 * Empty nested arrays contribute no entries.
 *
 * @param array<array-key, mixed> $array  the (possibly nested) array to flatten
 * @param string                  $prefix key path of the parent level; empty for the top level
 *
 * @return array<array-key, mixed> flat array keyed by dotted key paths
 */
private static function flattenArray(array $array, string $prefix = ''): array
{
    $result = [];

    foreach ($array as $key => $value) {
        $newKey = '' === $prefix ? $key : $prefix . '.' . $key;

        if (!is_array($value)) {
            $result[$newKey] = $value;
            continue;
        }

        // Copy entry by entry instead of array_merge(): array_merge() renumbers
        // integer keys of the accumulated result, which lets later integer keys
        // silently overwrite earlier values.
        foreach (self::flattenArray($value, (string) $newKey) as $flatKey => $flatValue) {
            $result[$flatKey] = $flatValue;
        }
    }

    return $result;
}
223249
}

lib/RssDiff.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public function calculate(): string
4242
$itemsAfter = $this->getItems($this->after);
4343

4444
$output = '';
45-
/** @var \Laminas\Feed\Reader\Entry\Rss $item */
45+
/** @var Rss $item */
4646
foreach ($itemsBefore as $id => $item) {
4747
if (array_key_exists($id, $itemsAfter)) {
4848
$diff = HtmlDiffAdvanced::create($this->renderItem($item), $this->renderItem($itemsAfter[$id]));

lib/Url.php

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22

33
namespace FriendsOfRedaxo\DiffDetect;
44

5+
use Exception;
56
use InvalidArgumentException;
67
use rex;
78
use rex_instance_pool_trait;
8-
use rex_socket;
9-
use rex_socket_response;
109
use rex_sql;
1110
use rex_sql_exception;
11+
use Symfony\Component\HttpClient\HttpClient;
12+
13+
use function sprintf;
1214

1315
final class Url
1416
{
@@ -26,7 +28,7 @@ private function __construct(int $id)
2628
}
2729

2830
/**
29-
* @return null|static
31+
* @return static|null
3032
*/
3133
public static function get(int $id): ?self
3234
{
@@ -115,31 +117,60 @@ private static function fromSqlData(array $data): self
115117
return $dataset;
116118
}
117119

118-
public function getContent(): rex_socket_response
120+
/**
 * Fetches this URL with a GET request and returns body, headers and status code.
 *
 * Basic authentication is only sent when both login and password are set.
 *
 * @throws Exception when the final HTTP status is not 200 (the exception code
 *                   carries the status code) or when the body is empty;
 *                   exceptions raised by the HTTP client itself are not caught here
 *
 * @return array{Content: string, Headers: array, StatusCode: int}
 */
public function getResponse(): array
{
    $url = (string) $this->getValue('url');

    $options = [
        'timeout' => self::$timeout,
        'max_redirects' => self::$maxRedirects,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ],
    ];

    // The credential columns are nullable; normalise null to '' so that URLs
    // without credentials do not send a bogus "Authorization: Basic" header.
    $login = (string) $this->getValue('http_auth_login');
    $password = (string) $this->getValue('http_auth_password');
    if ('' !== $login && '' !== $password) {
        $options['headers']['Authorization'] = 'Basic ' . base64_encode($login . ':' . $password);
    }

    $response = HttpClient::create()->request('GET', $url, $options);

    // Check the status before reading headers/body: getStatusCode() does not
    // throw on 3xx/4xx/5xx, so the descriptive exception below is actually used
    // and carries the real status code for the caller to record.
    $statusCode = $response->getStatusCode();
    if (200 !== $statusCode) {
        throw new Exception(sprintf('Failed to fetch content from URL "%s". HTTP status code: %d', $url, $statusCode), $statusCode);
    }

    $headers = $response->getHeaders();
    $content = $response->getContent();

    if ('' === $content) {
        throw new Exception(sprintf('No content received from URL "%s".', $url));
    }

    return [
        'Content' => $content,
        'Headers' => $headers,
        'StatusCode' => $statusCode,
    ];
}
144175

145176
public function getType(): string
@@ -178,4 +209,15 @@ public function setLastScan(): void
178209
],
179210
);
180211
}
212+
213+
/**
 * Stores the given message in the `last_message` column of this URL's row.
 *
 * @param string $message text to persist
 */
public function setLastMessage(string $message): void
{
    $query = 'update ' . rex::getTable('diff_detect_url') . ' set last_message = :last_message where id = :id';

    $params = [
        'id' => $this->getId(),
        'last_message' => $message,
    ];

    $sql = rex_sql::factory();
    $sql->setQuery($query, $params);
}
181223
}

0 commit comments

Comments
 (0)