diff --git a/README.md b/README.md
index bd7df3f..8628a1f 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Unlike its Ruby Counterparts, `clj-libil` is only implemented as library.
 Just include this to fetch it from Clojars
 
 ```clojure
-[clj-libil "0.1.0"]
+[clj-libil "0.1.1"]
 ```
 
 ## Usage
@@ -38,6 +38,17 @@ There are 4 functions to convert word and sentences using `clj-libil`
 (convert-sentence-ngalam "Ngalup Ayabarus") ;; Pulang Surabaya
 ```
 
+## Release Notes
+
+### Version 0.1.1
+
+- Using `StringReader` to process token.
+- Using transient collection for optimisation.
+
+### Version 0.1.0
+
+- Initial Version
+
 ## License
 
 The MIT License (MIT)
diff --git a/project.clj b/project.clj
index d4122e2..872e031 100644
--- a/project.clj
+++ b/project.clj
@@ -1,4 +1,4 @@
-(defproject clj-libil "0.1.0"
+(defproject clj-libil "0.1.1"
   :description "Clojure port of Libil, Processor of Bahasa Walikan"
   :url "http://github.com/lynxluna/clj-libil"
   :license {:name "MIT License"
diff --git a/src/libil/core.clj b/src/libil/core.clj
index 2f769c3..d647487 100644
--- a/src/libil/core.clj
+++ b/src/libil/core.clj
@@ -1,5 +1,6 @@
 (ns libil.core
-  (:use [clojure.string :only [split lower-case upper-case capitalize join]]))
+  (:use [clojure.string :only [split lower-case upper-case capitalize join]])
+  (:import [java.io Reader StringReader]))
 
 (def first-pair
   ["h" "n" "c" "r" "k" "d" "t" "s" "w" "l"])
@@ -13,15 +14,37 @@
 (defn- within? [coll item]
   ((complement nil?) (some (set [item]) coll)))
 
+(defn- rdr-peek
+  "Return the next char code of `rdr` (-1 at end of stream) without consuming it."
+  [^Reader rdr]
+  (.mark rdr 1)
+  (let [c (.read rdr)]
+    (.reset rdr)
+    c))
+
+(defn tokenize-rdr
+  "Tokenize the characters of `rdr` into a vector of strings. Two
+  adjacent characters that form a member of `all-con` (compared in
+  lower case) become one token; any other character is its own token.
+  An exhausted reader yields []. `rdr` must support mark/reset."
+  [^Reader rdr]
+  (loop [tokens (transient [])
+         current (.read rdr)]
+    ;; -1 means end of stream; test it before `char`, which rejects -1.
+    (if (== -1 current)
+      (persistent! tokens)
+      (let [cc (str (char current))
+            ahead (rdr-peek rdr)]
+        (cond (== -1 ahead) (persistent! (conj! tokens cc))
+              (within? all-con (lower-case (str cc (char ahead))))
+              (do (.skip rdr 1)
+                  (recur (conj! tokens (str cc (char ahead))) (.read rdr)))
+              :else (recur (conj! tokens cc) (.read rdr)))))))
+
 (defn tokenize-word
   "Tokenizing the word, to be able to be mapped"
   [^String w]
-  (loop [l [] rstr w]
-    (let [pair (apply str (take 2 rstr))
-          fstr (str (first rstr))]
-      (cond (empty? rstr) l
-            (within? all-con (lower-case pair)) (recur (conj l pair) (apply str (-> rstr rest rest)))
-            :else (recur (conj l fstr) (apply str (rest rstr)))))))
+  (tokenize-rdr (StringReader. w)))
 
 (defn- inv-cap
   "Inverse Capitalize"