class String
  # Creates a "probably unique" id with the desired length, composed only of
  # lowercase letters.
  #
  # The MD5 digest of the string is written in base 26 and its digits
  # (0-9, a-p) are mapped onto the letters a-z. The last +length+ characters
  # of that encoding are returned.
  #
  # When the encoding is shorter than +length+, it is left-padded with 'a'
  # (the letter standing for digit 0), so that the result always contains
  # exactly +length+ characters instead of being nil.
  def base26_hash(length=Picolena::HashLength)
    letters = Digest::MD5.hexdigest(self).to_i(16).to_s(26).tr('0-9a-p', 'a-z')
    letters.rjust(length, 'a')[-length, length]
  end
end

module Enumerable
  # Similar to Enumerable#each, but creates a new thread for each element.
  # Used for the indexer to make it multi-threaded.
  # It ensures that threads are joined together before returning.
  #
  # The element is handed to Thread.new as an argument so that every thread
  # works on its own reference. Returns the array of joined threads.
  def each_with_thread(&block)
    threads = map do |elem|
      # A distinct parameter name avoids shadowing the outer block variable.
      Thread.new(elem) { |item| block.call(item) }
    end
    threads.each { |thread| thread.join }
  end
end

class Array
  # Returns a partition of n arrays.
  # Elements are dealt out in turn (element i goes to slice i % n), which
  # avoids getting arrays that are too different.
  #   >> (0..17).to_a.in_transposed_slices(5)
  #   => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
  # while
  #   >> (0..17).each_slice(5).to_a
  #   => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
  #
  # If some folders contain big files and some others contain small ones,
  # every indexing thread will get some of both!
  #
  # Always returns n slices (some may be empty when the array holds fewer
  # than n elements). nil elements of the receiver are kept.
  def in_transposed_slices(n)
    # no need to compute anything if n==1
    return [self] if n==1
    slices = Array.new(n) { [] }
    each_with_index { |elem, index| slices[index % n] << elem }
    slices
  end
end

class Hash
  # Counts one more occurrence of +category+.
  #
  # Creates the entry {:size=>0} for +category+ when it is missing, then
  # increments its :size counter. Returns the new count.
  def add(category)
    self[category] ||= {:size=>0}
    self[category][:size] += 1
  end
end

require 'shellwords'

class File
  # Returns the filetype of filename as a symbol.
  # Returns :no_extension unless an extension is found
  #   >> File.ext_as_sym("test.pdf")
  #   => :pdf
  #   >> File.ext_as_sym("test.tar.gz")
  #   => :gz
  #   >> File.ext_as_sym("test")
  #   => :no_extension
  def self.ext_as_sym(filename)
    ext = File.extname(filename).sub(/\A\./, '').downcase
    # An empty string converts to :"" on current rubies instead of raising,
    # so the missing extension has to be detected explicitly.
    ext.empty? ? :no_extension : ext.to_sym
  rescue ArgumentError, TypeError
    # filename could not be interpreted as a path (nil, embedded NUL, ...)
    :no_extension
  end

  # Returns a probable encoding for a given plain text file
  # If source is a html file, it parses its first 20 lines for a charset
  # declaration, and uses file -i otherwise.
  # Returns iso-8859-15 instead of iso-8859-1, to be sure the euro sign can
  # be encoded, and an empty string when the charset is reported as unknown.
  # It requires the head, file, grep and sed commands.
  def self.encoding(source)
    path = source.to_s
    # The pattern is quoted so that the shell cannot expand it as a glob.
    parse_for_charset = "grep -io 'charset=[a-z0-9-]*' | sed 's/charset=//i'"
    reader = File.extname(path).downcase[0, 4] == ".htm" ? "head -n20" : "file -i"
    # The path is shell-escaped: quotes, $ or backticks in a file name must
    # not be interpreted by the shell. "--" protects names starting with "-".
    enc = %x{#{reader} -- #{Shellwords.escape(path)} | #{parse_for_charset}}.chomp
    # iso-8859-15 should be used instead of iso-8859-1, for the euro sign
    case enc
    when "iso-8859-1" then "iso-8859-15"
    when "unknown"    then ""
    else enc
    end
  end

  # Returns the content of a file and removes it after.
  # Could be used to read temporary output file written by a PlainTextExtractor.
  def self.read_and_remove(filename)
    content = read(filename)
    FileUtils.rm filename, :force=>true
    content
  end

  # Returns nil unless filename is a plain text file
  # (otherwise the position of the match in the output of file -i).
  # It requires file command.
  # NOTE: What to use for Win32?
  # NOTE(review): the pattern below was reconstructed (it was missing from
  # the source); confirm it against the project's history.
  def self.plain_text?(filename)
    %x{file -i -- #{Shellwords.escape(filename.to_s)}} =~ /: text\//
  end
end