Filtering by Number of Sentences
Filtering by Number of Sentences
NLP divides a source text into sentences. The following example filters out sources that contain less than 75 sentences. It returns the total number of sources, then uses the filter to return the total number of sources containing 75 or more sentences, then uses the filter again to return the number of sentences in each of these sources:
DomainCreateOrOpen
SET dname="mydomain"
IF (##class(%iKnow.Domain).NameIndexExists(dname))
{ SET domoref=##class(%iKnow.Domain).NameIndexOpen(dname)
GOTO DeleteOldData }
ELSE { SET domoref=##class(%iKnow.Domain).%New(dname)
DO domoref.%Save()
GOTO SetEnvironment }
DeleteOldData
SET stat=domoref.DropData()
IF stat { GOTO SetEnvironment }
ELSE { WRITE "DropData error ",$System.Status.DisplayError(stat)
QUIT}
SetEnvironment
SET domId=domoref.Id
IF ##class(%iKnow.Configuration).Exists("myconfig") {
SET cfg=##class(%iKnow.Configuration).Open("myconfig") }
ELSE { SET cfg=##class(%iKnow.Configuration).%New("myconfig",0,$LISTBUILD("en"),"",1)
DO cfg.%Save() }
ListerAndLoader
SET domId=domoref.Id
SET flister=##class(%iKnow.Source.SQL.Lister).%New(domId)
SET myloader=##class(%iKnow.Source.Loader).%New(domId)
QueryBuild
SET myquery="SELECT TOP 50 ID AS UniqueVal,Type,NarrativeFull FROM Aviation.Event"
SET idfld="UniqueVal"
SET grpfld="Type"
SET dataflds=$LB("NarrativeFull")
UseListerAndLoader
SET stat=flister.AddListToBatch(myquery,idfld,grpfld,dataflds)
IF stat '= 1 {WRITE "The lister failed: ",$System.Status.DisplayError(stat) QUIT }
SET stat=myloader.ProcessBatch()
IF stat '= 1 {WRITE "The loader failed: ",$System.Status.DisplayError(stat) QUIT }
DefineAFilter
SET filt=##class(%iKnow.Filters.SentenceCountFilter).%New(domId)
SET nsent=75
DO filt.MinSentenceCountSet(nsent)
SourceSentenceQueries
SET numSrcD=##class(%iKnow.Queries.SourceAPI).GetCountByDomain(domId)
WRITE "The domain contains ",numSrcD," sources",!
SET numSrcFD=##class(%iKnow.Queries.SourceAPI).GetCountByDomain(domId,filt)
WRITE "Of these ",numSrcD," sources ",numSrcFD," contain ",nsent," or more sentences:",!
DO ##class(%iKnow.Queries.SourceAPI).GetByDomain(.result,domId,1,50,filt)
SET i=1
WHILE $DATA(result(i)) {
SET numSentS = ##class(%iKnow.Queries.SentenceAPI).GetCountBySource(domId,result(i))
SET intId = $LISTGET(result(i),1)
SET extId = $LISTGET(result(i),2)
WRITE "source ",intId," ",extId
WRITE " has ",numSentS," sentences",!
SET i=i+1 }
WRITE i-1," sources listed"
To filter using both minimum and maximum number of sentences, invoke both instance methods. The following filter selects those sources containing between 10 and 25 sentences (inclusive):
MinMaxFilter
SET min=10
SET filt=##class(%iKnow.Filters.SentenceCountFilter).%New(domId)
DO filt.MinSentenceCountSet(min)
DO filt.MaxSentenceCountSet(min+15)