Skip to content

Commit 96eb67f

Browse files
Add tests for ReadabilityClass and include demo-with-iframe.html fixture
1 parent 783b779 commit 96eb67f

File tree

4 files changed

+636
-4
lines changed

4 files changed

+636
-4
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ Please change the version according to your PHP version.
3333
## Usage
3434

3535
```php
36+
use The3LabsTeam\Readability\Facades\Readability;
37+
3638
$html = '<html>...</html>';
3739
$parsed = Readability::parse($html);
3840
$title = $parsed->getTitle();
@@ -54,6 +56,29 @@ $image = $parsed->getImage();
5456
$images = $parsed->getImages();
5557
```
5658

59+
### Get the source list
60+
61+
```php
62+
use The3LabsTeam\Readability\Readability as ReadabilityClass;
63+
64+
$html = '<html>...</html>';
65+
$parsed = (new ReadabilityClass($html))
66+
->addSourceList(
67+
$domainWhitelist = ['example.com', 'another-example.com/some-path'],
68+
$tagsToExtract = ['a', 'iframe', 'text'] // Optional, default is ['a', 'iframe']
69+
)
70+
->parse();
71+
$content = $parsed->getContent();
72+
```
73+
74+
`$content` will end with the list of sources found in the article.
75+
76+
```html
77+
...
78+
<p>Source list: https://example.com/source1, https://example.com/source2</p>
79+
```
80+
81+
5782
## Testing
5883

5984
```bash

src/Readability.php

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,27 @@
99

1010
class Readability
1111
{
12-
private ?FFReadability $content;
12+
private ?FFReadability $content = null;
13+
public ?string $rawHtml = null;
14+
public array $sourceList = [];
15+
16+
public function __construct(?string $rawHtml = null)
17+
{
18+
$this->rawHtml = $rawHtml ?? '';
19+
20+
return $this;
21+
}
1322

1423
/**
1524
* Parse the content
1625
*
1726
* @throws Exception
1827
*/
19-
public function parse(string $content): self
28+
public function parse(?string $content = null): self
2029
{
2130
$this->content = new FFReadability(new Configuration());
2231
try {
23-
$this->content->parse($content);
32+
$this->content->parse($content ?? $this->rawHtml);
2433
} catch (ParseException $e) {
2534
$this->content = null;
2635
error_log('Cannot parse: '.$e->getMessage());
@@ -98,7 +107,14 @@ public function getContent(): string
98107
{
99108
$this->checkContent();
100109

101-
return $this->content->getContent();
110+
$content = $this->content->getContent();
111+
112+
if(!empty($this->sourceList)) {
113+
$sourceLinksHtml = '<p>Source list: ' . implode(', ', $this->sourceList) . '</p>';
114+
$content .= "\n\n" . $sourceLinksHtml;
115+
}
116+
117+
return $content;
102118
}
103119

104120
/**
@@ -113,6 +129,73 @@ public function getDirection(): string
113129
return $this->content->getDirection();
114130
}
115131

132+
/**
133+
* Extract links from the selected tags and from the text, filtered by domain
134+
*
135+
* @param array $domainWhitelist
136+
* @param array $tagsToExtract Any of 'a', 'iframe', 'text' (default: ['a', 'iframe'])
137+
* @return $this
138+
*/
139+
public function addSourceList(array $domainWhitelist = [], array $tagsToExtract = ['a', 'iframe']): self
140+
{
141+
$dom = new \DOMDocument();
142+
libxml_use_internal_errors(true);
143+
$dom->loadHTML($this->rawHtml);
144+
libxml_clear_errors();
145+
$links = [];
146+
147+
if (in_array('a', $tagsToExtract)) {
148+
foreach ($dom->getElementsByTagName('a') as $a) {
149+
$href = $a->getAttribute('href');
150+
if ($href) {
151+
$links[] = $href;
152+
}
153+
}
154+
}
155+
if (in_array('iframe', $tagsToExtract)) {
156+
foreach ($dom->getElementsByTagName('iframe') as $iframe) {
157+
$src = $iframe->getAttribute('src');
158+
if ($src) {
159+
$links[] = $src;
160+
}
161+
}
162+
}
163+
164+
if (in_array('text', $tagsToExtract)) {
165+
$text = $dom->textContent;
166+
if ($text) {
167+
if (preg_match_all('#\b((https?://|www\.)[^\s<>"]+)#i', $text, $matches)) {
168+
foreach ($matches[1] as $url) {
169+
$links[] = $url;
170+
}
171+
}
172+
}
173+
}
174+
175+
// Drop links that do not start with http://, https:// or www.
176+
$links = array_filter($links, fn($link) =>
177+
preg_match('#^(https?://|www\.)#i', $link)
178+
);
179+
180+
if (!empty($domainWhitelist)) {
181+
$filtered = [];
182+
foreach ($links as $href) {
183+
foreach ($domainWhitelist as $domain) {
184+
if (stripos($href, $domain) !== false) {
185+
$filtered[] = $href;
186+
break;
187+
}
188+
}
189+
}
190+
} else {
191+
$filtered = $links;
192+
}
193+
194+
$this->sourceList = array_values(array_unique($filtered));
195+
196+
return $this;
197+
}
198+
116199
/**
117200
* Check if the content is parsed
118201
*

tests/ReadabilityTest.php

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?php
22

33
use The3LabsTeam\Readability\Facades\Readability;
4+
use The3LabsTeam\Readability\Readability as ReadabilityClass;
45

56
it('cannot parse a non html', function () {
67
$html = 'This is a test';
@@ -9,31 +10,65 @@
910
})->toThrow(Exception::class);
1011
});
1112

13+
it('cannot parse a non html from ReadabilityClass', function () {
14+
$html = 'This is a test';
15+
expect(function () use ($html) {
16+
$readability = new ReadabilityClass($html);
17+
$readability->parse();
18+
})->toThrow(Exception::class);
19+
});
20+
1221
it('can parse and get the title', function () {
1322
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
1423
$parsed = Readability::parse($html);
1524
$title = $parsed->getTitle();
1625
expect($title)->toBe('Bitcoin: A Peer-to-Peer Electronic Cash System');
1726
});
1827

28+
it('can parse and get the title from ReadabilityClass', function () {
29+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
30+
$parsed = (new ReadabilityClass($html))->parse();
31+
$title = $parsed->getTitle();
32+
expect($title)->toBe('Bitcoin: A Peer-to-Peer Electronic Cash System');
33+
});
34+
35+
1936
it('can parse and get excerpt', function () {
2037
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
2138
$excerpt = Readability::parse($html)->getExcerpt();
2239
expect($excerpt)->toBe('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
2340
});
2441

42+
it('can parse and get excerpt from ReadabilityClass', function () {
43+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
44+
$excerpt = (new ReadabilityClass($html))->parse()->getExcerpt();
45+
expect($excerpt)->toBe('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
46+
});
47+
2548
it('can parse and get the author', function () {
2649
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
2750
$author = Readability::parse($html)->getAuthor();
2851
expect($author)->toBe('Satoshi Nakamoto');
2952
});
3053

54+
it('can parse and get the author from ReadabilityClass', function () {
55+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
56+
$author = (new ReadabilityClass($html))->parse()->getAuthor();
57+
expect($author)->toBe('Satoshi Nakamoto');
58+
});
59+
3160
it('can parse and get the image', function () {
3261
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
3362
$image = Readability::parse($html)->getImage();
3463
expect($image)->toBe('https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg');
3564
});
3665

66+
it('can parse and get the image from ReadabilityClass', function () {
67+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
68+
$image = (new ReadabilityClass($html))->parse()->getImage();
69+
expect($image)->toBe('https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg');
70+
});
71+
3772
it('can parse and get images', function () {
3873
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
3974
$images = Readability::parse($html)->getImages();
@@ -48,15 +83,78 @@
4883
->and($images)->toContain('/static/img/bitcoin/privacy.svg');
4984
});
5085

86+
it('can parse and get images from ReadabilityClass', function () {
87+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
88+
$images = (new ReadabilityClass($html))->parse()->getImages();
89+
expect($images)->toBeArray()->toHaveCount(8)
90+
->and($images)->toContain('https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg')
91+
->and($images)->toContain('/static/img/bitcoin/transactions.svg')
92+
->and($images)->toContain('/static/img/bitcoin/timestamp-server.svg')
93+
->and($images)->toContain('/static/img/bitcoin/proof-of-work.svg')
94+
->and($images)->toContain('/static/img/bitcoin/reclaiming-disk-space.svg')
95+
->and($images)->toContain('/static/img/bitcoin/simplified-payment-verification.svg')
96+
->and($images)->toContain('/static/img/bitcoin/combining-splitting-value.svg')
97+
->and($images)->toContain('/static/img/bitcoin/privacy.svg');
98+
});
99+
51100
it('can parse and get the direction', function () {
52101
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
53102
$direction = Readability::parse($html)->getDirection();
54103
expect($direction)->toBe('ltr');
55104
});
56105

106+
it('can parse and get the direction from ReadabilityClass', function () {
107+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
108+
$direction = (new ReadabilityClass($html))->parse()->getDirection();
109+
expect($direction)->toBe('ltr');
110+
});
111+
57112
it('can parse and get the content', function () {
58113
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
59114
$readability = Readability::parse($html);
60115
$readability->getContent();
61116
expect($readability)->not->toBeNull();
62117
});
118+
119+
it('can parse and get the content from ReadabilityClass', function () {
120+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
121+
$readability = (new ReadabilityClass($html))->parse();
122+
$content = $readability->getContent();
123+
expect($content)->not->toBeNull();
124+
expect($content)->toContain('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
125+
});
126+
127+
it('can parse and get the content with source list', function () {
128+
$html = file_get_contents(__DIR__.'/fixtures/demo-with-iframe.html');
129+
$content = (new ReadabilityClass($html))->addSourceList(['facebook.com'])->parse()->getContent();
130+
expect($content)->not->toBeNull();
131+
expect($content)->toContain('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
132+
expect($content)->toContain('https://www.facebook.com/v3.2/plugins/post');
133+
});
134+
135+
it('can parse and get the content with source list with specific URL', function () {
136+
$html = file_get_contents(__DIR__.'/fixtures/demo-with-iframe.html');
137+
$content = (new ReadabilityClass($html))->addSourceList(['facebook.com/v3.2/plugins'])->parse()->getContent();
138+
expect($content)->not->toBeNull();
139+
expect($content)->toContain('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
140+
expect($content)->toContain('https://www.facebook.com/v3.2/plugins/post');
141+
expect($content)->not->toContain('https://www.facebook.com/posts/');
142+
});
143+
144+
145+
it('can parse and get the content without source list', function () {
146+
$html = file_get_contents(__DIR__.'/fixtures/demo.html');
147+
$content = (new ReadabilityClass($html))->addSourceList(['facebook.com'])->parse()->getContent();
148+
expect($content)->not->toBeNull();
149+
expect($content)->toContain('A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.');
150+
expect($content)->not->toContain('https://www.facebook.com/v3.2/plugins/post');
151+
});
152+
153+
it('can parse and get the content with all source list', function () {
154+
$html = file_get_contents(__DIR__.'/fixtures/demo-with-iframe.html');
155+
$content = (new ReadabilityClass($html))->addSourceList()->parse()->getContent();
156+
expect($content)->not->toBeNull();
157+
expect($content)->toContain('https://satoshi.nakamotoinstitute.org');
158+
expect($content)->toContain('https://www.facebook.com/v3.2/plugins/post');
159+
expect($content)->toContain('https://www.bitcoin.com/wp-content/uploads/2020/10/bitcoin-whitepaper-featured-image.jpg');
160+
});

0 commit comments

Comments
 (0)