Skip to content

Commit 100f21a

Browse files
Refactor Readability parameters to use named arguments and enhance regex support in domain filtering; add tests for regex functionality with demo HTML fixture
1 parent 96eb67f commit 100f21a

File tree

4 files changed

+30
-11
lines changed

4 files changed

+30
-11
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ use The3LabsTeam\LaravelReadability\Readability as ReadabilityClass;
6464
$html = '<html>...</html>';
6565
$parsed = (new ReadabilityClass($html))
6666
->addSourceList(
67-
$domainWhitelist = ['example.com', 'another-example.com/some-path'],
68-
$tagsToExtract = ['a', 'iframe', 'text'] // Optional, default is ['a', 'iframe']
67+
domainWhitelist: ['example.com', 'another-example.com/some-path'],
68+
tagsToExtract: ['a', 'iframe', 'text'] // Optional, default is ['a', 'iframe']
6969
)
7070
->parse();
7171
$content = $parsed->getContent();

src/Readability.php

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ public function getDirection(): string
132132
/**
133133
* Estrae i link da tag specifici e dal testo, filtrando per dominio
134134
*
135-
* @param array $domainWhitelist
135+
* @param array $domainWhitelist Entries to keep links for: each entry is either a plain string (case-insensitive substring match against the link) or a regex pattern that starts and ends with "/"
136136
* @param array $tagsToExtract ['a', 'iframe', 'img', 'text']
137137
* @return $this
138138
*/
@@ -172,18 +172,23 @@ public function addSourceList(array $domainWhitelist = [], array $tagsToExtract
172172
}
173173
}
174174

175-
//Rimuovi i link che non hanno https o http
176-
$links = array_filter($links, fn($link) =>
177-
preg_match('#^(https?://|www\.)#i', $link)
178-
);
179-
180175
if (!empty($domainWhitelist)) {
181176
$filtered = [];
182177
foreach ($links as $href) {
183178
foreach ($domainWhitelist as $domain) {
184-
if (stripos($href, $domain) !== false) {
185-
$filtered[] = $href;
186-
break;
179+
// Treat the entry as a regex only if it starts and ends with "/" (a pattern with trailing modifiers, e.g. /foo/i, is handled as a plain string)
180+
if (preg_match('/^\/.*\/$/', $domain)) {
181+
// It's a regex pattern
182+
if (preg_match($domain, $href)) {
183+
$filtered[] = $href;
184+
break;
185+
}
186+
} else {
187+
// It's a regular domain string
188+
if (stripos($href, $domain) !== false) {
189+
$filtered[] = $href;
190+
break;
191+
}
187192
}
188193
}
189194
}

tests/ReadabilityTest.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
use The3LabsTeam\Readability\Facades\Readability;
44
use The3LabsTeam\Readability\Readability as ReadabilityClass;
55

6+
//./vendor/bin/pest tests/ReadabilityTest.php
7+
68
it('cannot parse a non html', function () {
79
$html = 'This is a test';
810
expect(function () use ($html) {
@@ -157,4 +159,12 @@
157159
expect($content)->toContain('https://satoshi.nakamotoinstitute.org');
158160
expect($content)->toContain('https://www.facebook.com/v3.2/plugins/post');
159161
expect($content)->toContain('https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg');
162+
});
163+
164+
it('can parse and get the content with source list (regex)', function () {
165+
$html = file_get_contents(__DIR__.'/fixtures/demo-with-iframe.html');
166+
$content = (new ReadabilityClass($html))->addSourceList(['/twitter\.com\/.*\/status\/\d+/'])->parse()->getContent();
167+
expect($content)->not->toBeNull();
168+
expect($content)->toContain('https://twitter.com/TomsHWItalia/status/1927702682380149224?ref_src=twsrc%5Etfw');
169+
expect($content)->not->toContain('https://twitter.com/DA_NON_PRENDERE');
160170
});

tests/fixtures/demo-with-iframe.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,10 @@ <h4>October 31, 2008</h4>
128128
<iframe class="" style="border: none; width: 100%; height: 478px; visibility: visible;" title="fb:post Facebook Social Plugin" src="https://www.facebook.com/v3.2/plugins/post.php?app_id=&amp;channel=https%3A%2F%2Fstaticxx.facebook.com%2Fx%2Fconnect%2Fxd_arbiter%2F%3Fversion%3D46%23cb%3Df59d0d8549941a612%26domain%3Dtomshardware.test%26is_canvas%3Dfalse%26origin%3Dhttp%253A%252F%252Ftomshardware.test%252Ff882f8e4004a0c055%26relation%3Dparent.parent&amp;container_width=834&amp;href=https%3A%2F%2Fwww.facebook.com%2Ftomshardware.it%2Fposts%2Fpfbid09kerg3CPZNbDVZvTJeqBKVjuDYuXwNTRNtV2zycQD1BrJ5tvbVy6FLPLkaz4FR5nl&amp;lazy=true&amp;locale=en_US&amp;sdk=joey&amp;show_text=true&amp;width=auto" name="f1b548f199a063ba9" height="1000px" frameborder="0" scrolling="no" allow="encrypted-media" allowfullscreen="allowfullscreen" loading="lazy" data-testid="fb:post Facebook Social Plugin"></iframe>
129129

130130
<iframe class="" src="https://www.facebook.com/posts/18dwdwxcowmkd"></iframe>
131+
<iframe class="" src="https://twitter.com/TomsHWItalia/status/1927702682380149224?ref_src=twsrc%5Etfw"></iframe>
132+
<iframe class="" src="https://twitter.com/DA_NON_PRENDERE?ref_src=twsrc%5Etfw"></iframe>
133+
134+
131135

132136
<img class="img-fluid d-block mx-auto my-4" src="https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg">
133137

0 commit comments

Comments
 (0)