diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index d3b30077..4f1b3328 100644 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -334,7 +334,8 @@ "CSS selector", "Extract EXIF", "Extract ID3", - "Extract Files" + "Extract Files", + "RAKE" ] }, { diff --git a/src/core/operations/RAKE.mjs b/src/core/operations/RAKE.mjs new file mode 100644 index 00000000..d1165b51 --- /dev/null +++ b/src/core/operations/RAKE.mjs @@ -0,0 +1,149 @@ +/** + * @author sw5678 + * @copyright Crown Copyright 2024 + * @license Apache-2.0 + */ + +import Operation from "../Operation.mjs"; + +/** + * RAKE operation + */ +class RAKE extends Operation { + + /** + * RAKE constructor + */ + constructor() { + super(); + + this.name = "RAKE"; + this.module = "Default"; + this.description = [ + "Rapid Keyword Extraction (RAKE)", + "

", + "RAKE is a domain-independent keyword extraction algorithm in Natural Language Processing.", + "

", + "The list of stop words are from the NLTK python package", + ].join("\n"); + this.inputType = "string"; + this.outputType = "string"; + this.args = [ + { + name: "Word Delimiter (Regex)", + type: "text", + value: "\\s" + }, + { + name: "Sentence Delimiter (Regex)", + type: "text", + value: "\\.\\s|\\n" + }, + { + name: "Stop Words", + type: "text", + value: "i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itsef,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does',did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't" + } + ]; + } + + /** + * @param {string} input + * @param {Object[]} args + * @returns {string} + */ + run(input, args) { + + // Get delimiter regexs + const wordDelim = new RegExp(args[0], "g"); + const sentDelim = new RegExp(args[1], "g"); + + // Deduplicate the stop words and add the empty string + const stopWords = args[2].toLowerCase().replace(/ /g, "").split(",").unique(); + stopWords.push(""); + + // Lower case input and remove start and ending whitespace + input = input.toLowerCase().trim(); + + // Get tokens, token count, and phrases + const tokens = []; + const wordFrequencies = []; + let phrases = []; + + // Build up list of phrases and token counts + const sentences = input.split(sentDelim); + for (const sent of sentences) { + + // Split sentence into words + const splitSent = sent.split(wordDelim); + let startIndex = 0; + + for (let i = 0; i < splitSent.length; i++) { + const token = splitSent[i]; + if (stopWords.includes(token)) { + // If token is stop word then split to create phrase + phrases.push(splitSent.slice(startIndex, i)); + startIndex = i + 1; + } else { + // If token is not a stop word add to the count of the list of words + if (tokens.includes(token)) { + wordFrequencies[tokens.indexOf(token)]+=1; + } else { + tokens.push(token); + wordFrequencies.push(1); + } + } + } + phrases.push(splitSent.slice(startIndex)); + } + + // remove empty phrases + phrases = phrases.filter(subArray => subArray.length > 0); + + // Remove duplicate phrases + const uniquePhrases = [...new Set(phrases.map(function (phrase) { + return phrase.join(" "); + }))]; + phrases = uniquePhrases.map(function (phrase) { + return phrase.split(" "); + }); + + // Generate word_degree_matrix and populate + const wordDegreeMatrix = Array.from(Array(tokens.length), _ => Array(tokens.length).fill(0)); + phrases.forEach(function (phrase) { + phrase.forEach(function (word1) { + phrase.forEach(function (word2) { + wordDegreeMatrix[tokens.indexOf(word1)][tokens.indexOf(word2)]++; + }); + }); + }); + + // Calculate degree score for each token + const degreeScores = Array(tokens.length).fill(0); + for (let i=0; i b[0] - a[0]); + scores.unshift(new Array("Scores: ", "Keywords: ")); + + // Output works with the 'To Table' functionality already built into CC + return scores.map(function (score) { + return score.join(", "); + }).join("\n"); + } +} + +export default RAKE; diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs index 757d6e9d..e85b6ad3 100644 --- a/tests/operations/index.mjs +++ b/tests/operations/index.mjs @@ -118,6 +118,7 @@ import "./tests/PHP.mjs"; import "./tests/PowerSet.mjs"; import "./tests/Protobuf.mjs"; import "./tests/Rabbit.mjs"; +import "./tests/RAKE.mjs"; import "./tests/Regex.mjs"; import "./tests/Register.mjs"; import "./tests/RisonEncodeDecode.mjs"; diff --git a/tests/operations/tests/RAKE.mjs b/tests/operations/tests/RAKE.mjs new file mode 100644 index 00000000..8164ca01 --- /dev/null +++ b/tests/operations/tests/RAKE.mjs @@ -0,0 +1,22 @@ +/** + * RAKE, Rapid Automatic Keyword Extraction tests. + * + * @author sw5678 + * @copyright Crown Copyright 2024 + * @license Apache-2.0 + */ +import TestRegister from "../../lib/TestRegister.mjs"; + +TestRegister.addTests([ + { + "name": "RAKE: Basic Example", + "input": "test1 test2. test2", + "expectedOutput": "Scores: , Keywords: \n3.5, test1 test2\n1.5, test2", + "recipeConfig": [ + { + "op": "RAKE", + "args": ["\\s", "\\.\\s|\\n", "i,me,my,myself,we,our"] + }, + ], + } +]);