Skip to content

Commit cc73622

Browse files
committed
Add some hard-coded patterns for cleaning up very long titles and values
1 parent 32b7873 commit cc73622

File tree

1 file changed

+19
-3
lines changed

1 file changed

+19
-3
lines changed

main.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"io"
1111
"log"
1212
"os"
13+
"regexp"
1314
str "strings"
1415
"time"
1516

@@ -439,16 +440,21 @@ const (
439440
// Code -----------------------------------------------------------------------
440441

441442
type TripleAggregateToWikiPageConverter struct {
442-
InAggregate chan *TripleAggregate
443-
InIndex chan *map[string]*TripleAggregate
444-
OutPage chan *WikiPage
443+
InAggregate chan *TripleAggregate
444+
InIndex chan *map[string]*TripleAggregate
445+
OutPage chan *WikiPage
446+
cleanUpRegexes []*regexp.Regexp
445447
}
446448

447449
func NewTripleAggregateToWikiPageConverter() *TripleAggregateToWikiPageConverter {
448450
return &TripleAggregateToWikiPageConverter{
449451
InAggregate: make(chan *TripleAggregate, BUFSIZE),
450452
InIndex: make(chan *map[string]*TripleAggregate, BUFSIZE),
451453
OutPage: make(chan *WikiPage, BUFSIZE),
454+
cleanUpRegexes: []*regexp.Regexp{
455+
regexp.MustCompile(" [(][^)]*:[^)]*[)]"),
456+
regexp.MustCompile(" [[][^]]*:[^]]*[]]"),
457+
},
452458
}
453459
}
454460

@@ -488,6 +494,11 @@ func (p *TripleAggregateToWikiPageConverter) Run() {
488494
} else if tr.Obj.Type() == rdf.TermLiteral {
489495

490496
valueStr = tr.Obj.String()
497+
498+
for _, r := range p.cleanUpRegexes {
499+
valueStr = r.ReplaceAllString(valueStr, "")
500+
}
501+
491502
dataTypeStr := tr.Obj.(rdf.Literal).DataType.String()
492503

493504
// Add type info on the current property's page
@@ -590,6 +601,11 @@ func (p *TripleAggregateToWikiPageConverter) convertUriToWikiTitle(uri string, u
590601
factTitle = str.Replace(factTitle, "]", ")", -1)
591602
factTitle = html.EscapeString(factTitle)
592603

604+
// Clean up according to regexes
605+
for _, r := range p.cleanUpRegexes {
606+
factTitle = r.ReplaceAllString(factTitle, "")
607+
}
608+
593609
// Limit to max 255 chars (due to MediaWiki limitation)
594610
titleIsShortened := false
595611
for len(factTitle) >= 250 {

0 commit comments

Comments
 (0)